% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/clean_source.R, R/clean_sources.R,
%   R/search_and_replace_in_source.R, R/search_and_replace_in_sources.R
\name{clean_source}
\alias{clean_source}
\alias{clean_sources}
\alias{search_and_replace_in_source}
\alias{search_and_replace_in_sources}
\title{Cleaning & editing sources}
\usage{
clean_source(input, output = NULL,
  replacementsPre = rock::opts$get(replacementsPre),
  replacementsPost = rock::opts$get(replacementsPost),
  extraReplacementsPre = NULL, extraReplacementsPost = NULL,
  removeNewlines = FALSE,
  utteranceSplits = rock::opts$get(utteranceSplits),
  preventOverwriting = rock::opts$get(preventOverwriting),
  encoding = rock::opts$get(encoding), silent = rock::opts$get(silent))

clean_sources(input, output, filenamePrefix = "", filenameSuffix = "",
  recursive = TRUE, filenameRegex = ".*",
  replacementsPre = rock::opts$get(replacementsPre),
  replacementsPost = rock::opts$get(replacementsPost),
  extraReplacementsPre = NULL, extraReplacementsPost = NULL,
  removeNewlines = FALSE,
  utteranceSplits = rock::opts$get(utteranceSplits),
  preventOverwriting = rock::opts$get(preventOverwriting),
  encoding = rock::opts$get(encoding), silent = rock::opts$get(silent))

search_and_replace_in_source(input, replacements = NULL, output = NULL,
  preventOverwriting = TRUE, encoding = "UTF-8", silent = FALSE)

search_and_replace_in_sources(input, output, replacements = NULL,
  filenamePrefix = "", filenameSuffix = "_postReplacing",
  preventOverwriting = rock::opts$get(preventOverwriting),
  recursive = TRUE, filenameRegex = ".*",
  encoding = rock::opts$get(encoding), silent = FALSE)
}
\arguments{
\item{input}{For \code{clean_source} and \code{search_and_replace_in_source}, either a character
vector containing the text of the relevant source \emph{or} a path to a file that contains
the source text; for \code{clean_sources} and \code{search_and_replace_in_sources}, a path to a
directory that contains the sources to clean.}

\item{output}{For \code{clean_source} and \code{search_and_replace_in_source}, if not \code{NULL},
this is the name (and path) of the file in which to save the processed source (if it
\emph{is} \code{NULL}, the result will be returned visibly). For \code{clean_sources} and
\code{search_and_replace_in_sources}, \code{output} is mandatory and is the path to the
directory where to store the processed sources. This path will be created with a
warning if it does not exist. An exception is if "\code{same}" is specified - in that
case, every file will be written to the same directory it was read from.}

\item{replacementsPre, replacementsPost}{Each is a list of two-element vectors,
where the first element in each vector contains a regular expression to search for
in the source(s), and the second element contains the replacement (these are passed
as \code{perl} regular expressions; see \code{\link{regex}} for more information).
Instead of regular expressions, simple words or phrases can also be entered of
course (since those are valid regular expressions). \code{replacementsPre} are executed
before the \code{utteranceSplits} are applied; \code{replacementsPost} afterwards.}

\item{extraReplacementsPre, extraReplacementsPost}{To perform more replacements
than the default set, these can be conveniently specified in \code{extraReplacementsPre}
and \code{extraReplacementsPost}. This prevents you from having to
manually copy-paste the list of defaults to retain it.}

\item{removeNewlines}{Whether to remove all newline characters from the source before
starting to clean it.}

\item{utteranceSplits}{This is a vector of regular expressions that specify where to
insert breaks between utterances in the source(s). Such breaks are specified using
\code{utteranceMarker}.}

\item{preventOverwriting}{Whether to prevent overwriting of output files.}

\item{encoding}{The encoding of the source(s).}

\item{silent}{Whether to suppress the warning about not editing the cleaned source.}

\item{filenamePrefix, filenameSuffix}{The prefix and suffix to add to the
filenames when writing the processed files to disk.}

\item{recursive}{Whether to also search all subdirectories (\code{TRUE}) or not (\code{FALSE}).}

\item{filenameRegex}{A regular expression to match against located files; only
files matching this regular expression are processed.}

\item{replacements}{The strings to search & replace, as a list of two-element vectors,
where the first element in each vector contains a regular expression to search for
in the source(s), and the second element contains the replacement (these are passed
as \code{perl} regular expressions; see \code{\link{regex}} for more information).
Instead of regular expressions, simple words or phrases can also be entered of
course (since those are valid regular expressions).}
}
\value{
A character vector for \code{clean_source}, or a list of character vectors
for \code{clean_sources}.
}
\description{
These functions can be used to 'clean' one or more sources or perform search and
replace tasks. Cleaning consists of two operations: splitting the source at
utterance markers, and conducting search and replaces using regular expressions.
}
\details{
The cleaning functions, when called with their default arguments, will do the following:
\itemize{
\item Double periods (\code{..}) will be replaced with single periods (\code{.})
\item Four or more periods (\code{....} or \code{.....}) will be replaced with three periods
\item Three or more newline characters will be replaced by one newline character (which
will become more, if the sentence before that character marks the end of an
utterance)
\item All sentences will become separate utterances (in a semi-smart manner;
specifically, breaks in speaking, if represented by three periods, are not
considered sentence ends, whereas ellipses ("…" or unicode 2026, see the example) \emph{are}).
\item If there are commas without a space following them, a space will be inserted.
}
}
\examples{
exampleSource <-
"Do you like icecream?


Well, that depends\\u2026 Sometimes, when it's..... Nice. Then I do,
but otherwise... not really, actually."

### Default settings:
cat(clean_source(exampleSource));

### First remove existing newlines:
cat(clean_source(exampleSource,
                 removeNewlines=TRUE));

exampleSource <-
"Do you like icecream?


Well, that depends\\u2026 Sometimes, when it's..... Nice. Then I do,
but otherwise... not really, actually."

### Simple text replacements:
cat(search_and_replace_in_source(exampleSource,
                                 replacements=list(c("\\u2026", "..."),
                                                   c("Nice", "Great"))));

### Using a regular expression to capitalize all words following
### a period:
cat(search_and_replace_in_source(exampleSource,
                                 replacements=list(c("\\\\.(\\\\s*)([a-z])", ".\\\\1\\\\U\\\\2"))));

}
