% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/extract_phrases.R
\name{extract_phrases}
\alias{extract_phrases}
\title{Extract Phrases}
\usage{
extract_phrases(POS_tagged_documents, regex = "(A|N)*N(PD*(A|N)*N)*",
  maximum_ngram_length = 8, minimum_ngram_length = 2,
  return_phrase_vectors = TRUE, return_tag_sequences = FALSE)
}
\arguments{
\item{POS_tagged_documents}{A list object of the form produced by the
\code{POS_tag_documents()} function, with either Penn TreeBank or Petrov/Gimpel
style tags.}

\item{regex}{The regular expression used to find phrases. Defaults to
"(A|N)*N(PD*(A|N)*N)*", the "SimpleNP" grammar in Handler et al. 2016.}

\item{maximum_ngram_length}{The maximum length of phrases returned. Defaults
to 8. Increasing this number can greatly increase runtime.}

\item{minimum_ngram_length}{The minimum length of phrases returned. Defaults
to 2. Can be increased to remove shorter phrases, or decreased to include
unigrams.}

\item{return_phrase_vectors}{Logical indicating whether a list of phrase
vectors (with each entry containing a vector of phrases in one document)
should be returned, or whether phrases should be combined into a single
space-separated string. Defaults to TRUE.}

\item{return_tag_sequences}{Logical indicating whether tag sequences should
be returned along with phrases. Defaults to FALSE.}
}
\value{
A list object.
}
\description{
Extracts phrases from a list of POS tagged documents using the
"FilterFSA" method in Handler et al. 2016.
}
\examples{
\dontrun{
# load data
corp <- quanteda::corpus(quanteda::inaugTexts)
documents <- quanteda::texts(corp)[1:5]

# run tagger
tagged_documents <- POS_tag_documents(documents)

phrases <- extract_phrases(tagged_documents,
                           regex = "(A|N)*N(PD*(A|N)*N)*",
                           maximum_ngram_length = 8,
                           minimum_ngram_length = 1)
}
}

