% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/Functions.R
\name{gap}
\alias{gap}
\title{Gap statistic algorithm.}
\usage{
gap(
  data,
  rss,
  meth = c("kmeans", "uniform", "dirichlet", "nmf"),
  itr = 50,
  lr = 0.01,
  ncore = 2
)
}
\arguments{
\item{data}{Data matrix or data frame.}

\item{rss}{Numeric vector of residual sums of squares from the \code{ssmf()} model fitted with \eqn{1, 2, \ldots, k} clusters.}

\item{meth}{Character, the method used to initialise the \eqn{W} and \eqn{H} matrices; see the \code{method} argument of \code{init()}.}

\item{itr}{Integer, number of Monte Carlo samples.}

\item{lr}{Optimisation learning rate in ssmf().}

\item{ncore}{The number of cores to use for parallel execution.}
}
\value{
\code{gap} Gap value vector.

\code{optimal.k} The optimal number of prototypes/clusters.

\code{standard.error} Standard error vector.
}
\description{
Estimating the number of prototypes/clusters in a data set using the gap statistic.
}
\details{
The gap statistic looks for the largest gap between the residual sum of squares (RSS) expected under an appropriate null reference distribution of the data and the RSS of the original data, both on the log scale. The gap is defined as
\deqn{\mathrm{Gap}(k) = \frac{1}{B} \sum_{b=1}^{B} \log(\mathrm{RSS}^*_{kb}) - \log(\mathrm{RSS}_{k}),}

where \eqn{B} is the number of samples from the reference distribution;
\eqn{\mathrm{RSS}^*_{kb}} is the residual sum of squares of the \eqn{b}-th sample from the reference distribution fitted with the SSMF model using \eqn{k} clusters;
\eqn{\mathrm{RSS}_{k}} is the residual sum of squares for the original data \eqn{X} fitted with the model using the same \eqn{k}.
The estimated gap suggests the number of prototypes/clusters (\eqn{\hat{k}}) using

\deqn{\hat{k} = \mathrm{smallest} \ k \  \mathrm{such \ that} \ \mathrm{Gap}(k) \geq \mathrm{Gap}(k+1) - s_{k+1},}

where \eqn{s_{k+1}} is the standard error, defined as

\deqn{s_{k+1}=sd_{k+1} \sqrt{1+\frac{1}{B}},}

and \eqn{sd_k} is the standard deviation:

\deqn{sd_k=\sqrt{ \frac{1}{B} \sum_{b} [\log(\mathrm{RSS}^*_{kb})-\frac{1}{B} \sum_{b} \log(\mathrm{RSS}^*_{kb})]^2}.}
}
\examples{
# example code
\donttest{
data <- SimulatedDataset

k <- 6

rss <- rep(NA, k)
for(i in 1:k){
  rss[i] <- ssmf(data = data, k = i)$SSE
}

gap(data = data, rss = rss)
}
}
\references{
Tibshirani, R., Walther, G., & Hastie, T. (2001). Estimating the Number of Clusters in a Data Set via the Gap Statistic. Journal of the Royal Statistical Society. Series B (Statistical Methodology), 63(2), 411–423. \doi{10.1111/1467-9868.00293}
}
\author{
Wenxuan Liu
}
