% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/CreateTcm.R
\name{CreateTcm}
\alias{CreateTcm}
\title{Convert a character vector to a term co-occurrence matrix.}
\usage{
CreateTcm(doc_vec, skipgram_window = Inf, ngram_window = c(1, 1),
  stopword_vec = c(tm::stopwords("english"), tm::stopwords("SMART")),
  lower = TRUE, remove_punctuation = TRUE, remove_numbers = TRUE,
  stem_lemma_function = NULL, ...)
}
\arguments{
\item{doc_vec}{A character vector of documents.}

\item{skipgram_window}{An integer window, from \code{0} to \code{Inf} for 
skip-grams. Defaults to \code{Inf}. See 'Details', below.}

\item{ngram_window}{A numeric vector of length 2. The first entry is the minimum
n-gram size; the second entry is the maximum n-gram size. Defaults to
\code{c(1, 1)}. Must be \code{c(1, 1)} if \code{skipgram_window} is 
not \code{0} or \code{Inf}.}

\item{stopword_vec}{A character vector of stopwords you would like to remove.
Defaults to \code{c(tm::stopwords("english"), tm::stopwords("SMART"))}. 
If you do not want stopwords removed, specify \code{stopword_vec = c()}.}

\item{lower}{Do you want all words coerced to lower case? Defaults to \code{TRUE}.}

\item{remove_punctuation}{Do you want to convert all non-alphanumeric 
characters to spaces? Defaults to \code{TRUE}.}

\item{remove_numbers}{Do you want to convert all numbers to spaces? Defaults 
to \code{TRUE}.}

\item{stem_lemma_function}{A function that you would like to apply to the 
documents for stemming, lemmatization, or similar. See examples for
usage.}

\item{...}{Other arguments to be passed to \code{\link[textmineR]{TmParallelApply}}.}
}
\value{
A term co-occurrence matrix of class \code{dgCMatrix}. The rows and 
columns both index terms. The i, j entries represent the count of term j 
co-occurring with term i, as determined by \code{skipgram_window}.
}
\description{
This is the main term co-occurrence matrix creating function for \code{textmineR}.
In most cases, all you need to do is import documents as a character vector in R and then 
run this function to get a term co-occurrence matrix that is compatible with the 
rest of \code{textmineR}'s functionality and many other libraries. \code{CreateTcm}
is built on top of the excellent \code{\link[text2vec]{text2vec}} library.
}
\details{
Setting \code{skipgram_window} counts the number of times that term
         \code{j} appears within \code{skipgram_window} places of term \code{i}.
         \code{Inf} and \code{0} create somewhat special TCMs. Setting \code{skipgram_window}
         to \code{Inf} counts the number of times that term \code{j} appears 
         across all documents containing \code{i}. Setting \code{skipgram_window}
         to \code{0} counts the number of documents in which term \code{j} 
         and term \code{i} occur together. A TCM where \code{skipgram_window} 
         is \code{0} is the only TCM that will be symmetric.
}
\note{
The following transformations are applied to \code{stopword_vec} as 
      well as \code{doc_vec}: 
      \code{lower}, 
      \code{remove_punctuation}, 
      \code{remove_numbers}.
      
      See \code{\link[tm]{stopwords}} for details on the default value of the 
      \code{stopword_vec} argument.
}
\examples{
\dontrun{
data(nih_sample)

# TCM of unigrams and bigrams
tcm <- CreateTcm(doc_vec = nih_sample$ABSTRACT_TEXT,
                 skipgram_window = Inf, 
                 ngram_window = c(1, 2))

# TCM of unigrams and a skip-gram window of 3, applying Porter's word stemmer
tcm <- CreateTcm(doc_vec = nih_sample$ABSTRACT_TEXT,
                 skipgram_window = 3,
                 stem_lemma_function = function(x) SnowballC::wordStem(x, "porter"))
}
}

