% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RogueTaxa.R, R/SPIC.R, R/zz_RogueNaRok.R
\name{RogueTaxa}
\alias{RogueTaxa}
\alias{QuickRogue}
\alias{C_RogueNaRok}
\title{Drop rogue taxa to generate a more informative consensus}
\usage{
RogueTaxa(
  trees,
  info = c("spic", "scic", "fspic", "fscic", "rbic"),
  return = c("taxa", "tree"),
  bestTree = NULL,
  computeSupport = TRUE,
  dropsetSize = 1,
  neverDrop = character(0),
  labelPenalty = 0,
  mreOptimization = FALSE,
  threshold = 50,
  verbose = FALSE
)

QuickRogue(
  trees,
  info = "phylogenetic",
  p = 0.5,
  log = TRUE,
  average = "median",
  deviation = "mad",
  neverDrop,
  fullSeq = FALSE
)

C_RogueNaRok(
  bootTrees = "",
  runId = "tmp",
  treeFile = "",
  computeSupport = TRUE,
  dropsetSize = 1,
  excludeFile = "",
  workDir = "",
  labelPenalty = 0,
  mreOptimization = FALSE,
  threshold = 50
)
}
\arguments{
\item{trees}{List of trees to analyse.}

\item{info}{Concept of information to employ; see details.}

\item{return}{If \code{taxa}, returns the leaves identified as rogues; if \code{tree},
returns a consensus tree omitting rogue taxa.}

\item{computeSupport}{Logical: If \code{FALSE}, then instead of trying to maximize
the support in the consensus tree, RogueNaRok will try to maximize the number
of bipartitions in the final tree by pruning taxa.}

\item{dropsetSize}{Integer specifying maximum size of dropset per iteration.
If \code{dropsetSize == n}, then RogueNaRok will test in each iteration which
tuple of \code{n} taxa increases the optimality criterion the most, pruning
taxa accordingly.
This improves the result, but run times will increase at least linearly.}

\item{neverDrop}{Tip labels that should not be dropped from the consensus.}

\item{labelPenalty}{A weight factor to penalize for dropset size when
\code{info = 'rbic'}.
The higher the value, the more conservative the algorithm is in pruning taxa.
The default value of \code{0} gives the \acronym{RBIC}; \code{1} gives Pattengale's
criterion.}

\item{threshold, mreOptimization}{A threshold or mode for the consensus tree
that is optimized. Specify a value between 50 (majority rule consensus,
the default) and 100 (strict consensus), or set \code{mreOptimization = TRUE}
for the extended majority rule consensus.
Note that rogue taxa identified with respect to different thresholds can
vary substantially.}

\item{verbose}{Logical specifying whether to display output from RogueNaRok.
If \code{FALSE}, output will be included as an attribute of the return value.}

\item{p}{Proportion of trees that must contain a split before it is included
in the consensus under consideration.  0.5, the default, corresponds to a
majority rule tree; 1.0 will maximize the information content of the
strict consensus.}

\item{log}{Logical specifying whether to log-transform distances when
calculating leaf stability.}

\item{average}{Character specifying whether to use \code{"mean"} or \code{"median"}
tip distances to calculate leaf stability.}

\item{deviation}{Character specifying whether to use \code{"sd"} or \code{"mad"} to
calculate leaf stability.}

\item{fullSeq}{Logical specifying whether to list all taxa (\code{TRUE}), or
only those that improve information content when all are dropped (\code{FALSE}).}

\item{bootTrees}{Path to a file containing a collection of bootstrap trees.}

\item{runId}{An identifier for this run, appended to output files.}

\item{treeFile, bestTree}{If a single best-known tree (such as an ML or MP tree)
is provided, RogueNaRok optimizes the bootstrap support in this
best-known tree (still drawn from the bootstrap trees);
the \code{threshold} parameter is ignored.}

\item{excludeFile}{Taxa in this file (one taxon per line) will not be
considered for pruning.}

\item{workDir}{Path to a working directory where output files are created.}
}
\value{
\code{RogueTaxa()} returns a \code{data.frame}. The first row describes the
starting tree; each subsequent row describes a dropset operation.
Columns describe:
\itemize{
\item \code{num}: Sequential index of the drop operation
\item \code{taxNum}: Numeric identifier of the dropped leaves
\item \code{taxon}: Text identifier of dropped leaves
\item \code{rawImprovement}: Improvement in score obtained by this operation
\item \code{IC}: Information content of tree after dropping all leaves so far,
by the measure indicated by \code{info}.
}

\code{C_RogueNaRok()} returns \code{0} if successful; \code{-1} on error.
}
\description{
\code{RogueTaxa()} finds wildcard leaves whose removal increases the resolution
or branch support values of a consensus tree, using the relative
bipartition, shared phylogenetic, or mutual clustering concepts of
information.
}
\details{
"Rogue" or (loosely) "wildcard" taxa \insertCite{Nixon1992}{Rogue} are
leaves whose position in a tree is poorly constrained, typically because
much of the phylogenetic data associated with the taxon is either missing or
in conflict with other data \insertCite{Kearney2002}{Rogue}.

These functions use heuristic methods to identify rogue taxa whose removal
improves the information content of a consensus tree, by the definitions
of information discussed below.
}
\section{Functions}{
\itemize{
\item \code{QuickRogue()}: Shortcut to 'fast' heuristic, with option to return
evaluation of all taxa using \code{fullSeq = TRUE}.

}}
\section{Information criteria}{

The splitwise phylogenetic information content measure produces the best
results \insertCite{SmithCons}{Rogue}.
It uses the splitwise information content as a shortcut, which involves
double counting of some information (which may or may not be desirable).
The same holds for the mutual clustering information measure; this measure
is less obviously suited to the detection of rogues.
This measure interprets split frequency as a proxy for the probability
that a split is true, which is a valid interpretation of a Bayesian posterior
sample \insertCite{Holder2008}{Rogue},
a reasonable but imperfect interpretation of a bootstrap sample
\insertCite{Berry1996}{Rogue}, and a bad interpretation of a sample of
most parsimonious trees.

The "relative bipartition information criterion" (\acronym{RBIC}) is
the sum of all support values divided by the maximum possible support in a
fully bifurcating tree with the initial set of taxa.
The relative bipartition information content approach employs the
'RogueNaRok' implementation \insertCite{Aberer2013}{Rogue}, which can handle
large trees relatively quickly.
The \acronym{RBIC} is not strictly a measure of information and can
produce undesirable results \insertCite{Wilkinson2017}{Rogue}.

\code{C_RogueNaRok()} directly interfaces the 'RogueNaRok' C implementation,
with no input checking; be aware that invalid input will cause undefined
behaviour and is likely to crash R.
}

\examples{
library("TreeTools", warn.conflicts = FALSE)

# Two trees that differ in the placement of the pair (X1, X2);
# dropsetSize = 2 allows both leaves to be dropped in a single operation
trees <- list(read.tree(text = ("(a, (b, (c, (d, (e, (X1, X2))))));")),
              read.tree(text = ("((a, (X1, X2)), (b, (c, (d, e))));")))
RogueTaxa(trees, dropsetSize = 2)

trees <- list(
     read.tree(text = '((a, y), (b, (c, (z, ((d, e), (f, (g, x)))))));'),
     read.tree(text = '(a, (b, (c, (z, (((d, y), e), (f, (g, x)))))));'),
     read.tree(text = '(a, (b, ((c, z), ((d, (e, y)), ((f, x), g)))));'),
     read.tree(text = '(a, (b, ((c, z), ((d, (e, x)), (f, (g, y))))));'),
     read.tree(text = '(a, ((b, x), ((c, z), ((d, e), (f, (g, y))))));')
     )

# Majority-rule consensus of all leaves, labelled with split frequencies
cons <- consensus(trees, p = 0.5)
plot(cons)
LabelSplits(cons, SplitFrequency(cons, trees) / length(trees))

# Consensus after omitting rogue taxa, labelled in the same way
reduced <- RogueTaxa(trees, info = 'phylogenetic', return = 'tree')
plot(reduced)
LabelSplits(reduced, SplitFrequency(reduced, trees) / length(trees))

QuickRogue(trees, fullSeq = TRUE)

bootTrees <- system.file('example/150.bs', package = 'Rogue')
tmpDir <- tempdir()
XX <- capture.output( # Don't print verbose run details to console
  C_RogueNaRok(bootTrees, workDir = tmpDir)
)

# Results have been written to our temporary directory;
# address them by full path so the working directory is left unchanged
droppedFile <- file.path(tmpDir, 'RogueNaRok_droppedRogues.tmp')
infoFile <- file.path(tmpDir, 'RogueNaRok_info.tmp')
head(read.table(droppedFile, header = TRUE))

# Delete temporary files
file.remove(droppedFile)
file.remove(infoFile)
}
\references{
\insertAllCited{}
}
\author{
\href{https://smithlabdurham.github.io/}{Martin R. Smith}
(\href{mailto:martin.smith@durham.ac.uk}{martin.smith@durham.ac.uk}), linking to
\href{https://github.com/aberer/RogueNaRok/}{RogueNaRok}
C library by Andre Aberer (<andre.aberer at googlemail.com>)
}
