% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/parse_functions.R
\name{biber}
\alias{biber}
\alias{biber.spacyr_parsed}
\alias{biber.udpipe_connlu}
\title{Extract Biber features from a document parsed and annotated by spacyr or udpipe}
\usage{
biber(
  tokens,
  measure = c("MATTR", "TTR", "CTTR", "MSTTR", "none"),
  normalize = TRUE
)

\method{biber}{spacyr_parsed}(
  tokens,
  measure = c("MATTR", "TTR", "CTTR", "MSTTR", "none"),
  normalize = TRUE
)

\method{biber}{udpipe_connlu}(
  tokens,
  measure = c("MATTR", "TTR", "CTTR", "MSTTR", "none"),
  normalize = TRUE
)
}
\arguments{
\item{tokens}{A dataset of tokens created by \code{spacyr::spacy_parse()} or
\code{udpipe::udpipe_annotate()}.}

\item{measure}{Measure to use for type-token ratio. Passed to
\code{quanteda.textstats::textstat_lexdiv()} to calculate the statistic. Can be
the Moving Average Type-Token Ratio (MATTR), ordinary Type-Token Ratio
(TTR), corrected TTR (CTTR), Mean Segmental Type-Token Ratio (MSTTR), or
\code{"none"} to skip calculating a type-token ratio. If a statistic is chosen
but there are fewer than 200 tokens in the smallest document, the TTR is
used instead.}

\item{normalize}{If \code{TRUE}, count features are normalized to the rate per
1,000 tokens.}
}
\value{
A \code{data.frame} of features containing one row per document and one
column per feature. If \code{normalize} is \code{TRUE}, count features are normalized
to the rate per 1,000 tokens.
}
\description{
Takes data that has been part-of-speech tagged and dependency parsed and
extracts counts of features that have been used in Douglas Biber's research
since the late 1980s.
}
\details{
Refer to \code{spacyr::spacy_parse()} or \code{udpipe::udpipe_annotate()} for details
on parsing texts. These must be configured to do part-of-speech and
dependency parsing. For \code{spacyr::spacy_parse()}, use the \code{dependency = TRUE},
\code{tag = TRUE}, and \code{pos = TRUE} arguments; for \code{udpipe::udpipe_annotate()},
set the \code{tagger} and \code{parser} arguments to \code{"default"}.

Feature extraction relies on a dictionary (included as \code{\link{dict}}) and word
lists (\code{\link{word_lists}}) to match specific features; see their documentation
and values for details on the exact patterns and words matched by each. The
function identifies other features based on local cues, which are
approximations. Because they rely on probabilistic taggers provided by spaCy
or udpipe, the accuracy of the resulting counts is dependent on the accuracy
of those models. Thus, texts with irregular spellings, non-normative
punctuation, etc. will likely produce unreliable outputs, unless taggers are
tuned specifically for those purposes.

The following features are detected. Square brackets in example sentences
indicate the location of the feature.
\subsection{Tense and aspect markers}{

\describe{
\item{f_01_past_tense}{Verbs in the past tense.}
\item{f_02_perfect_aspect}{Verbs in the perfect aspect, indicated by "have" as an auxiliary verb (e.g., \emph{I [have] written this sentence.})}
\item{f_03_present_tense}{Verbs in the present tense.}
}
}

\subsection{Place and time adverbials}{

\describe{
\item{f_04_place_adverbials}{Place adverbials (e.g., \emph{above}, \emph{beside}, \emph{outdoors}; see list in \code{dict$f_04_place_adverbials})}
\item{f_05_time_adverbials}{Time adverbials (e.g., \emph{early}, \emph{instantly}, \emph{soon}; see \code{dict$f_05_time_adverbials})}
}
}

\subsection{Pronouns and pro-verbs}{

\describe{
\item{f_06_first_person_pronouns}{First-person pronouns; see \code{dict$f_06_first_person_pronouns}}
\item{f_07_second_person_pronouns}{Second-person pronouns; see \code{dict$f_07_second_person_pronouns}}
\item{f_08_third_person_pronouns}{Third-person personal pronouns (excluding \emph{it}); see \code{dict$f_08_third_person_pronouns}}
\item{f_09_pronoun_it}{Pronoun \emph{it}, \emph{its}, or \emph{itself}}
\item{f_10_demonstrative_pronoun}{Pronouns being used to replace a noun (e.g., \emph{[That] is an example sentence.})}
\item{f_11_indefinite_pronouns}{Indefinite pronouns (e.g., \emph{anybody}, \emph{nothing}, \emph{someone}; see \code{dict$f_11_indefinite_pronouns})}
\item{f_12_proverb_do}{Pro-verb \emph{do}}
}
}

\subsection{Questions}{

\describe{
\item{f_13_wh_question}{Direct \emph{wh-} questions (e.g., \emph{When are you leaving?})}
}
}

\subsection{Nominal forms}{

\describe{
\item{f_14_nominalizations}{Nominalizations (nouns ending in \emph{-tion}, \emph{-ment}, \emph{-ness}, \emph{-ity}, e.g., \emph{adjustment}, \emph{abandonment})}
\item{f_15_gerunds}{Gerunds (participial forms functioning as nouns)}
\item{f_16_other_nouns}{Total other nouns}
}
}

\subsection{Passives}{

\describe{
\item{f_17_agentless_passives}{Agentless passives (e.g., \emph{The task [was done].})}
\item{f_18_by_passives}{\emph{by-} passives (e.g., \emph{The task [was done by Steve].})}
}
}

\subsection{Stative forms}{

\describe{
\item{f_19_be_main_verb}{\emph{be} as main verb}
\item{f_20_existential_there}{Existential \emph{there} (e.g., \emph{[There] is a feature in this sentence.})}
}
}

\subsection{Subordination features}{

\describe{
\item{f_21_that_verb_comp}{\emph{that} verb complements (e.g., \emph{I said [that he went].})}
\item{f_22_that_adj_comp}{\emph{that} adjective complements (e.g., \emph{I'm glad [that you like it].})}
\item{f_23_wh_clause}{\emph{wh-} clauses (e.g., \emph{I believed [what he told me].})}
\item{f_24_infinitives}{Infinitives}
\item{f_25_present_participle}{Present participial adverbial clauses (e.g., \emph{[Stuffing his mouth with cookies], Joe ran out the door.})}
\item{f_26_past_participle}{Past participial adverbial clauses (e.g., \emph{[Built in a single week], the house would stand for fifty years.})}
\item{f_27_past_participle_whiz}{Past participial postnominal (reduced relative) clauses (e.g., \emph{the solution [produced by this process]})}
\item{f_28_present_participle_whiz}{Present participial postnominal (reduced relative) clauses (e.g., \emph{the event [causing this decline]})}
\item{f_29_that_subj}{\emph{that} relative clauses on subject position (e.g., \emph{the dog [that bit me]})}
\item{f_30_that_obj}{\emph{that} relative clauses on object position (e.g., \emph{the dog [that I saw]})}
\item{f_31_wh_subj}{\emph{wh-} relatives on subject position (e.g., \emph{the man [who likes popcorn]})}
\item{f_32_wh_obj}{\emph{wh-} relatives on object position (e.g., \emph{the man [who Sally likes]})}
\item{f_33_pied_piping}{Pied-piping relative clauses (e.g., \emph{the manner [in which he was told]})}
\item{f_34_sentence_relatives}{Sentence relatives (e.g., \emph{Bob likes fried mangoes, [which is the most disgusting thing I've ever heard of].})}
\item{f_35_because}{Causative adverbial subordinator (\emph{because})}
\item{f_36_though}{Concessive adverbial subordinators (\emph{although}, \emph{though})}
\item{f_37_if}{Conditional adverbial subordinators (\emph{if}, \emph{unless})}
\item{f_38_other_adv_sub}{Other adverbial subordinators (e.g., \emph{since}, \emph{while}, \emph{whereas})}
}
}

\subsection{Prepositional phrases, adjectives, and adverbs}{

\describe{
\item{f_39_prepositions}{Total prepositional phrases}
\item{f_40_adj_attr}{Attributive adjectives (e.g., \emph{the [big] horse})}
\item{f_41_adj_pred}{Predicative adjectives (e.g., \emph{The horse is [big].})}
\item{f_42_adverbs}{Total adverbs}
}
}

\subsection{Lexical specificity}{

\describe{
\item{f_43_type_token}{Type-token ratio (including punctuation), using the statistic chosen in \code{measure}, or TTR if there are fewer than 200 tokens in the smallest document.}
\item{f_44_mean_word_length}{Average word length (across tokens, excluding punctuation)}
}
}

\subsection{Lexical classes}{

\describe{
\item{f_45_conjuncts}{Conjuncts (e.g., \emph{consequently}, \emph{furthermore}, \emph{however}; see \code{dict$f_45_conjuncts})}
\item{f_46_downtoners}{Downtoners (e.g., \emph{barely}, \emph{nearly}, \emph{slightly}; see \code{dict$f_46_downtoners})}
\item{f_47_hedges}{Hedges (e.g., \emph{at about}, \emph{something like}, \emph{almost}; see \code{dict$f_47_hedges})}
\item{f_48_amplifiers}{Amplifiers (e.g., \emph{absolutely}, \emph{extremely}, \emph{perfectly}; see \code{dict$f_48_amplifiers})}
\item{f_49_emphatics}{Emphatics (e.g., \emph{a lot}, \emph{for sure}, \emph{really}; see \code{dict$f_49_emphatics})}
\item{f_50_discourse_particles}{Discourse particles (e.g., sentence-initial \emph{well}, \emph{now}, \emph{anyway}; see \code{dict$f_50_discourse_particles})}
\item{f_51_demonstratives}{Demonstratives (\emph{that}, \emph{this}, \emph{these}, or \emph{those} used as determiners, e.g., \emph{[That] sentence is an example.})}
}
}

\subsection{Modals}{

\describe{
\item{f_52_modal_possibility}{Possibility modals (\emph{can}, \emph{may}, \emph{might}, \emph{could})}
\item{f_53_modal_necessity}{Necessity modals (\emph{ought}, \emph{should}, \emph{must})}
\item{f_54_modal_predictive}{Predictive modals (\emph{will}, \emph{would}, \emph{shall})}
}
}

\subsection{Specialized verb classes}{

\describe{
\item{f_55_verb_public}{Public verbs (e.g., \emph{assert}, \emph{declare}, \emph{mention}; see \code{dict$f_55_verb_public})}
\item{f_56_verb_private}{Private verbs (e.g., \emph{assume}, \emph{believe}, \emph{doubt}, \emph{know}; see \code{dict$f_56_verb_private})}
\item{f_57_verb_suasive}{Suasive verbs (e.g., \emph{command}, \emph{insist}, \emph{propose}; see \code{dict$f_57_verb_suasive})}
\item{f_58_verb_seem}{\emph{seem} and \emph{appear}}
}
}

\subsection{Reduced forms and dispreferred structures}{

\describe{
\item{f_59_contractions}{Contractions}
\item{f_60_that_deletion}{Subordinator \emph{that} deletion (e.g., \emph{I think [he went].})}
\item{f_61_stranded_preposition}{Stranded prepositions (e.g., \emph{the candidate that I was thinking [of]})}
\item{f_62_split_infinitive}{Split infinitives (e.g., \emph{He wants [to convincingly prove] that ...})}
\item{f_63_split_auxiliary}{Split auxiliaries (e.g., \emph{They [were apparently shown] to ...})}
}
}

\subsection{Co-ordination}{

\describe{
\item{f_64_phrasal_coordination}{Phrasal co-ordination (N and N; Adj and Adj; V and V; Adv and Adv)}
\item{f_65_clausal_coordination}{Independent clause co-ordination (clause-initial \emph{and})}
}
}

\subsection{Negation}{

\describe{
\item{f_66_neg_synthetic}{Synthetic negation (e.g., \emph{No answer is good enough for Jones.})}
\item{f_67_neg_analytic}{Analytic negation (e.g., \emph{That isn't good enough.})}
}
}
}
\examples{
# Extract features from the pre-parsed example documents provided with the package
biber(udpipe_samples)

biber(spacy_samples)
}
\references{
Biber, Douglas (1985). "Investigating macroscopic textual
variation through multifeature/multidimensional analyses." \emph{Linguistics}
23(2), 337-360. \doi{10.1515/ling.1985.23.2.337}

Biber, Douglas (1988). \emph{Variation across Speech and Writing}.
Cambridge University Press.

Biber, Douglas (1995). \emph{Dimensions of Register Variation: A Cross-Linguistic
Comparison}. Cambridge University Press.

Covington, M. A., & McFall, J. D. (2010). Cutting the Gordian Knot: The
Moving-Average Type–Token Ratio (MATTR). \emph{Journal of Quantitative
Linguistics}, 17(2), 94–100. \doi{10.1080/09296171003643098}
}
\seealso{
\link{dict}, \link{word_lists}
}
