% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/get_cos_sim.R
\name{get_cos_sim}
\alias{get_cos_sim}
\title{Given a tokenized corpus, compute the cosine similarities
of the resulting ALC embeddings and a defined set of features.}
\usage{
get_cos_sim(
  x,
  groups = NULL,
  features = character(0),
  pre_trained,
  transform = TRUE,
  transform_matrix,
  bootstrap = TRUE,
  num_bootstraps = 10,
  stem = FALSE,
  as_list = TRUE
)
}
\arguments{
\item{x}{a (quanteda) \code{tokens-class} object}

\item{groups}{(numeric, factor, character) a binary variable of the same length as \code{x}}

\item{features}{(character) features of interest}

\item{pre_trained}{(numeric) a F x D matrix corresponding to pretrained embeddings.
F = number of features and D = embedding dimensions.
rownames(pre_trained) = set of features for which there is a pre-trained embedding.}

\item{transform}{(logical) if TRUE (default) apply the 'a la carte' transformation,
if FALSE output untransformed averaged embeddings.}

\item{transform_matrix}{(numeric) a D x D 'a la carte' transformation matrix.
D = dimensions of pretrained embeddings.}

\item{bootstrap}{(logical) if TRUE, use bootstrapping -- sample from texts with replacement and
re-estimate cosine similarities for each sample. Required to get std. errors.
If \code{groups} defined, sampling is automatically stratified.}

\item{num_bootstraps}{(integer) number of bootstraps to use.}

\item{stem}{(logical) - If TRUE, both \code{features} and \code{rownames(pre_trained)}
are stemmed and average cosine similarities are reported.
We recommend you remove misspelled words from  \code{pre_trained} as these can
significantly influence the average.}

\item{as_list}{(logical) if FALSE all results are combined into a single data.frame.
If TRUE, a list of data.frames is returned with one data.frame per feature.}
}
\value{
a \code{data.frame} or list of data.frames (one for each feature)
with the following columns:
\describe{
\item{\code{target}}{ (character) rownames of \code{x},
the labels of the ALC embeddings.}
\item{\code{feature}}{(character) feature terms defined in
the \code{features} argument.}
\item{\code{value}}{(numeric) cosine similarity between \code{x}
and feature. Average over bootstrapped samples if bootstrap = TRUE.}
\item{\code{std.error}}{(numeric) std. error of the similarity value.
Column is dropped if bootstrap = FALSE.}
}
}
\description{
This is a wrapper function for \code{cos_sim()} that allows users to go from a
tokenized corpus to results with the option to bootstrap cosine similarities
and get the corresponding std. errors.
}
\examples{

library(quanteda)

# tokenize corpus
toks <- tokens(cr_sample_corpus)

# build a tokenized corpus of contexts surrounding a target term
immig_toks <- tokens_context(x = toks, pattern = "immigr*", window = 6L)

# compute the cosine similarity between each group's embedding
# and a specific set of features
set.seed(2021L)
get_cos_sim(x = immig_toks,
            groups = docvars(immig_toks, 'party'),
            features = c("reform", "enforce"),
            pre_trained = cr_glove_subset,
            transform = TRUE,
           transform_matrix = cr_transform,
            bootstrap = TRUE,
            num_bootstraps = 10,
            stem = TRUE,
            as_list = FALSE)
}
\keyword{get_cos_sim}
