% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/corpus_functions.R
\name{CreateDtm}
\alias{CreateDtm}
\title{Convert a character vector to a document term matrix.}
\usage{
CreateDtm(doc_vec, doc_names = names(doc_vec), ngram_window = c(1, 1),
  stopword_vec = c(stopwords::stopwords("en"),
  stopwords::stopwords(source = "smart")), lower = TRUE,
  remove_punctuation = TRUE, remove_numbers = TRUE,
  stem_lemma_function = NULL, verbose = FALSE, ...)
}
\arguments{
\item{doc_vec}{A character vector of documents.}

\item{doc_names}{A vector of names for your documents. Defaults to 
\code{names(doc_vec)}. If NULL, then doc_names is set to be 
\code{1:length(doc_vec)}.}

\item{ngram_window}{A numeric vector of length 2. The first entry is the minimum
n-gram size; the second entry is the maximum n-gram size. Defaults to
\code{c(1, 1)}.}

\item{stopword_vec}{A character vector of stopwords you would like to remove.
Defaults to \code{c(stopwords::stopwords("en"), stopwords::stopwords(source = "smart"))}. 
If you do not want stopwords removed, specify \code{stopword_vec = c()}.}

\item{lower}{Do you want all words coerced to lower case? Defaults to \code{TRUE}.}

\item{remove_punctuation}{Do you want to convert all non-alphanumeric 
characters to spaces? Defaults to \code{TRUE}.}

\item{remove_numbers}{Do you want to convert all numbers to spaces? Defaults 
to \code{TRUE}.}

\item{stem_lemma_function}{A function that you would like to apply to the 
documents for stemming, lemmatization, or similar. See examples for
usage.}

\item{verbose}{Defaults to \code{FALSE}. Do you want to see status during 
vectorization?}

\item{...}{Other arguments to be passed to \code{\link[textmineR]{TmParallelApply}}.}
}
\value{
A document term matrix of class \code{dgCMatrix}. The rows index 
documents. The columns index terms. The i, j entries represent the count of 
term j appearing in document i.
}
\description{
This is the main document term matrix creating function for \code{textmineR}.
In most cases, all you need to do is import documents as a character vector in R and then 
run this function to get a document term matrix that is compatible with the 
rest of \code{textmineR}'s functionality and many other libraries. \code{CreateDtm}
is built on top of the excellent \code{\link[text2vec]{text2vec}} library.
}
\note{
The following transformations are applied to \code{stopword_vec} as 
      well as \code{doc_vec}: 
      \code{lower}, 
      \code{remove_punctuation}, 
      \code{remove_numbers}.
      
      See \code{\link[stopwords]{stopwords}} for details on the default value of the 
      \code{stopword_vec} argument.
}
\examples{
\dontrun{
data(nih_sample)

# DTM of unigrams and bigrams
dtm <- CreateDtm(doc_vec = nih_sample$ABSTRACT_TEXT,
                 doc_names = nih_sample$APPLICATION_ID, 
                 ngram_window = c(1, 2))

# DTM of unigrams with Porter's stemmer applied
dtm <- CreateDtm(doc_vec = nih_sample$ABSTRACT_TEXT,
                 doc_names = nih_sample$APPLICATION_ID,
                 stem_lemma_function = function(x) SnowballC::wordStem(x, "porter"))
}
}
