## ----include = FALSE----------------------------------------------------------
# Global chunk options for the rendered document: collapse source and output
# into one block, suppress warnings and messages, and prefix output with "#>".
# Logical constants are spelled out in full because `T` and `F` are ordinary
# variables that can be reassigned, whereas `TRUE` and `FALSE` cannot.
knitr::opts_chunk$set(
  collapse = TRUE,
  warning = FALSE,
  message = FALSE,
  comment = "#>"
)

## -----------------------------------------------------------------------------
library(sentixr)

## -----------------------------------------------------------------------------
# Load the example dataset into the workspace and print it for inspection
data("recensioni_tv")
recensioni_tv

## -----------------------------------------------------------------------------
library(tidytext)

## -----------------------------------------------------------------------------
# Fetch the MAL lexicon (inflected forms) and preview its first rows
mal_dict <- get_sentix("MAL")
mal_dict |>
  head()

## -----------------------------------------------------------------------------
# Tokenize: split the `text` column of the reviews into a new `word` column
tidy_text <- unnest_tokens(recensioni_tv, output = word, input = text)

## -----------------------------------------------------------------------------
# Attach lexicon entries to the tokens by matching on `word`. A left join
# keeps every token, including those with no match in the lexicon.
# NOTE(review): left_join() is a dplyr verb and no library(dplyr) call is
# visible in this script -- confirm it is attached by another package.
tidy_sent <- left_join(tidy_text, mal_dict, by = "word")

# Preview the joined table
head(tidy_sent)

## -----------------------------------------------------------------------------
# Average sentiment per document, computed with the package helper
tidy_sent |>
  sentix_summarize(simplify = FALSE)

## -----------------------------------------------------------------------------
# Manual per-document summary with dplyr:
#   sentiment - mean of `score`, ignoring missing values
#   n_tokens  - number of rows (tokens) in the document
#   n_scored  - number of rows with a non-missing `score`
# `TRUE` is written in full because `T` is a reassignable variable.
tidy_sent |>
  group_by(doc_id) |>
  summarise(
    sentiment = mean(score, na.rm = TRUE),
    n_tokens = n(),
    n_scored = sum(!is.na(score))
  )

## -----------------------------------------------------------------------------
# Request the MAL lexicon again, this time with polarity labels, and preview it
polar_dict <- get_sentix("MAL", polarity = TRUE)
polar_dict |>
  head()

## -----------------------------------------------------------------------------
# Match the tokens against the polarity lexicon on `word` and preview the
# first rows of the result
head(left_join(tidy_text, polar_dict, by = "word"))

## -----------------------------------------------------------------------------
# Add a `polarity` column derived from `score` via make_polarity(), with a
# threshold of 0.125, and preview the first rows
head(
  mutate(mal_dict, polarity = make_polarity(score, threshold = 0.125))
)

## -----------------------------------------------------------------------------
# Apply make_polarity() to every numeric column of the table returned by
# get_elita() and show the last rows
get_elita() |>
  mutate(across(where(is.numeric), \(x) make_polarity(x))) |>
  tail()

## -----------------------------------------------------------------------------
library(quanteda)

## ----data---------------------------------------------------------------------
# Reload the example data, build a quanteda corpus from it and tokenize,
# discarding punctuation
data("recensioni_tv")
sentix_toks <- tokens(corpus(recensioni_tv), remove_punct = TRUE)

## -----------------------------------------------------------------------------
# Convert the MAL lexicon to a valence dictionary; `my_dict` is the
# `dictionary` argument in the textstat_valence() example below
my_dict <- mal_dict |>
  df_to_dict()

## ----eval=FALSE---------------------------------------------------------------
# df_to_valence(MAL)

## ----eval = FALSE-------------------------------------------------------------
# # Compute valence
# quanteda.sentiment::textstat_valence(sentix_toks, dictionary = my_dict)
# #>   doc_id  sentiment
# #> 1   doc1  0.2689482
# #> 2   doc2 -0.1755017
# #> 3   doc3  0.2788701
# #> 4   doc4  0.1295423
# #> 5   doc5 -0.0208181

## ----eval=FALSE---------------------------------------------------------------
# my_dict2 <- get_sentix("MAL", polarity = TRUE) |>
#   # if there are other numeric columns, other than 'polarity'
#   df_to_polar()

## -----------------------------------------------------------------------------
# Convert the polarity-labelled lexicon; `my_dict2` is the `dictionary`
# argument in the textstat_polarity() example below
my_dict2 <- polar_dict |>
  df_to_dict()

## ----eval = FALSE-------------------------------------------------------------
# # Compute polarity scores
# quanteda.sentiment::textstat_polarity(sentix_toks,
#                                       dictionary = my_dict2)
# #>   doc_id sentiment
# #> 1   doc1 2.8332133
# #> 2   doc2 0.0000000
# #> 3   doc3 1.4663371
# #> 4   doc4 0.9555114
# #> 5   doc5 0.0000000

