% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/textstat_frequency.R
\name{textstat_frequency}
\alias{textstat_frequency}
\title{Tabulate feature frequencies}
\usage{
textstat_frequency(x, n = NULL, groups = NULL)
}
\arguments{
\item{x}{a \link{dfm} object}

\item{n}{(optional) integer specifying the top \code{n} features to be returned,
within group if \code{groups} is specified}

\item{groups}{either: a character vector containing the names of document 
variables to be used for grouping; or a factor or object that can be 
coerced into a factor equal in length or rows to the number of documents. 
See \link{groups} for details.}
}
\value{
a data.frame containing the following variables:
\describe{
\item{\code{feature}}{(character) the feature}
\item{\code{frequency}}{count of the feature}
\item{\code{rank}}{rank of the feature, where 1 indicates the greatest
frequency}
\item{\code{docfreq}}{document frequency of the feature, as a count (the
number of documents in which this feature occurred at least once)}
\item{\code{group}}{(only if \code{groups} is specified) the label of the group.
If the features have been grouped, then all counts, ranks, and document
frequencies are within group.  If \code{groups} is not specified, the
\code{group} column is omitted from the returned data.frame.}
}

\code{textstat_frequency} returns a data.frame of features and
  their term and document frequencies, computed within groups when
  \code{groups} is specified.
}
\description{
Produces summaries of the counts and document frequencies of the features in a
\link{dfm}, optionally grouped by a \link{docvars} variable or other supplied
grouping variable.
}
\examples{
# build a small dfm from three short documents
dfm1 <- dfm(c("a a b b c d", "a d d d", "a a a"))
# tabulate feature frequencies over all documents
textstat_frequency(dfm1)
# tabulate within groups, supplying one group label per document
textstat_frequency(dfm1, groups = c("one", "two", "one"))

# dfm of the inaugural corpus documents where President equals Obama,
# created with remove_punct = TRUE and with the English stopword list removed
obamadfm <- 
    corpus_subset(data_corpus_inaugural, President == "Obama") \%>\%
    dfm(remove_punct = TRUE, remove = stopwords("english"))
freq <- textstat_frequency(obamadfm)
# show the first 10 rows of the frequency table
head(freq, 10)

\donttest{
# plot 20 most frequent words
library("ggplot2")
# reorder orders the features by their frequency, and coord_flip swaps the
# axes so that the features are listed along the vertical axis
ggplot(freq[1:20, ], aes(x = reorder(feature, frequency), y = frequency)) +
    geom_point() + 
    coord_flip() +
    labs(x = NULL, y = "Frequency")

# plot relative frequencies by group
# first build the dfm: documents with Year above 2000, grouped by the
# President document variable, then weighted with scheme = "prop"
dfm_weight_pres <- data_corpus_inaugural \%>\% 
    corpus_subset(Year > 2000) \%>\% 
    dfm(remove = stopwords("english"), remove_punct = TRUE) \%>\% 
    dfm_group(groups = "President") \%>\%
    dfm_weight(scheme = "prop")

# calculate relative frequency by president
# n = 10 keeps the top 10 features within each group
freq_weight <- textstat_frequency(dfm_weight_pres, n = 10,
                                  groups = "President")

# plot frequencies, one panel per group
# x is a descending row index over the whole table, and scale_x_continuous
# relabels each index position with the corresponding feature name
ggplot(data = freq_weight, aes(x = nrow(freq_weight):1, y = frequency)) +
    geom_point() +
    facet_wrap(~ group, scales = "free") +
    coord_flip() +
    scale_x_continuous(breaks = nrow(freq_weight):1,
                       labels = freq_weight$feature) +
    labs(x = NULL, y = "Relative frequency")
}
}
\keyword{plot}
