% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/corpus_trim.R
\name{corpus_trim}
\alias{corpus_trim}
\alias{char_trim}
\title{Remove sentences based on their token lengths or a pattern match}
\usage{
corpus_trim(
  x,
  what = c("sentences", "paragraphs", "documents"),
  min_ntoken = 1,
  max_ntoken = NULL,
  exclude_pattern = NULL
)

char_trim(
  x,
  what = c("sentences", "paragraphs", "documents"),
  min_ntoken = 1,
  max_ntoken = NULL,
  exclude_pattern = NULL
)
}
\arguments{
\item{x}{\link{corpus} or character object whose sentences will be selected.}

\item{what}{units of trimming: \code{"sentences"}, \code{"paragraphs"}, or
\code{"documents"}}

\item{min_ntoken, max_ntoken}{minimum and maximum lengths in word tokens
(excluding punctuation).  Note that these are approximate numbers of tokens
based on checking for word boundaries, rather than on-the-fly full
tokenisation.}

\item{exclude_pattern}{a \pkg{stringi} regular expression whose match (at the
sentence level) will be used to exclude sentences}
}
\value{
a \link{corpus} or character vector equal in length to the input.  If
the input was a corpus, then all the docvars and metadata are preserved.
For documents whose sentences have been removed entirely, a null string
(\code{""}) will be returned.
}
\description{
Removes sentences from a corpus or a character vector when they are shorter
or longer than the specified token lengths, or when they match a specified
pattern.
}
\examples{
txt <- c("PAGE 1. This is a single sentence.  Short sentence. Three word sentence.",
         "PAGE 2. Very short! Shorter.",
         "Very long sentence, with multiple parts, separated by commas.  PAGE 3.")
corp <- corpus(txt, docvars = data.frame(serial = 1:3))
corp

# exclude sentences shorter than 3 tokens
corpus_trim(corp, min_ntoken = 3)
# exclude sentences that start with "PAGE <digit(s)>"
corpus_trim(corp, exclude_pattern = "^PAGE \\\\d+")

# trimming character objects
char_trim(txt, "sentences", min_ntoken = 3)
char_trim(txt, "sentences", exclude_pattern = "sentence\\\\.")
}
\keyword{character}
\keyword{corpus}
