% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/feature_preparation.r
\name{term_char_sim}
\alias{term_char_sim}
\title{Find terms with similar spelling}
\usage{
term_char_sim(
  voc,
  type = c("tri", "bi"),
  min_overlap = 2/3,
  max_diff = 4,
  pad = F,
  as_lower = T,
  same_start = 1,
  drop_non_alpha = T,
  min_length = 5,
  allow_asym = F,
  verbose = T
)
}
\arguments{
\item{voc}{A character vector that gives the vocabulary (e.g., colnames of a dtm)}

\item{type}{Either "bi" (bigrams) or "tri" (trigrams)}

\item{min_overlap}{The minimal overlap percentage. Works together with max_diff to determine required overlap}

\item{max_diff}{The maximum number of bi/tri-grams that is different}

\item{pad}{If True, pad the left side (ls) and right side (rs) of bi/tri-grams. So, trigrams for "pad" would be: "ls_ls_p", "ls_p_a", "p_a_d", "a_d_rs", "d_rs_rs".}

\item{as_lower}{If True, ignore case}

\item{same_start}{Should terms start with the same character(s)? Given as the number of leading characters that must be the same. (This also greatly speeds up the calculation.)}

\item{drop_non_alpha}{If True, ignore non-alpha terms (e.g., numbers, punctuation). They will appear in the output matrix, but only with zeros.}

\item{min_length}{The minimum number of characters in a term. Terms with fewer characters are ignored. They will appear in the output matrix, but only with zeros.}

\item{allow_asym}{If True, the match only needs to be true for at least one term. In practice, this means that "America" would match perfectly with "Southern-America".}

\item{verbose}{If True, report progress}
}
\value{
A similarity matrix in the CsparseMatrix format
}
\description{
A quick, language-agnostic way of finding terms with similar spelling.
Calculates similarity as the percentage of a term's bigrams or trigrams that also occur in the other term.
The percentage has to be above the given threshold for both terms (unless allow_asym = T).
}
\examples{
dfm = quanteda::dfm(c('That guy Gadaffi','Do you mean Kadaffi?',
                      'Nah more like Gadaffel','What Gargamel?'))
simmat = term_char_sim(colnames(dfm), same_start=0)
term_union(dfm, simmat, verbose = FALSE)
}
