% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/auk-filter.r
\name{auk_filter}
\alias{auk_filter}
\alias{auk_filter.auk_ebd}
\alias{auk_filter.auk_sampling}
\title{Filter the eBird file using AWK}
\usage{
auk_filter(x, file, ...)

\method{auk_filter}{auk_ebd}(x, file, file_sampling, keep, drop, awk_file,
  sep = "\\t", filter_sampling = TRUE, execute = TRUE,
  overwrite = FALSE, ...)

\method{auk_filter}{auk_sampling}(x, file, keep, drop, awk_file,
  sep = "\\t", execute = TRUE, overwrite = FALSE, ...)
}
\arguments{
\item{x}{\code{auk_ebd} or \code{auk_sampling} object; reference to file created by
\code{\link[=auk_ebd]{auk_ebd()}} or \code{\link[=auk_sampling]{auk_sampling()}}.}

\item{file}{character; output file.}

\item{...}{arguments passed on to methods.}

\item{file_sampling}{character; optional output file for sampling data.}

\item{keep}{character; a character vector specifying the names of the columns
to keep in the output file. Columns should be as they appear in the header
of the EBD; however, names are not case sensitive and spaces may be
replaced by underscores, e.g. \code{"COMMON NAME"}, \code{"common name"}, and
\code{"common_NAME"} are all valid.}

\item{drop}{character; a character vector of columns to drop in the same
format as \code{keep}. Ignored if \code{keep} is supplied.}

\item{awk_file}{character; output file to optionally save the awk script to.}

\item{sep}{character; the input field separator. The eBird file is tab
separated by default. Must be a single character; a space delimiter is not
allowed since spaces appear in many of the fields.}

\item{filter_sampling}{logical; whether the sampling event data should also
be filtered.}

\item{execute}{logical; whether to execute the awk script, or output it to a
file for manual execution. If this flag is \code{FALSE}, \code{awk_file} must be
provided.}

\item{overwrite}{logical; whether to overwrite the output file if it already exists.}
}
\value{
An \code{auk_ebd} object with the output files set. If \code{execute = FALSE},
then the path to the AWK script is returned instead.
}
\description{
Convert the filters defined in an \code{auk_ebd} object into an AWK script and run
this script to produce a filtered eBird Basic Dataset (EBD). The initial
creation of the \code{auk_ebd} object should be done with \code{\link[=auk_ebd]{auk_ebd()}} and filters
can be defined using the various other functions in this package, e.g.
\code{\link[=auk_species]{auk_species()}} or \code{\link[=auk_country]{auk_country()}}. \strong{Note that this function typically takes
at least a couple of hours to run on the full dataset.}
}
\details{
If a sampling file is provided in the \link[=auk_ebd]{auk_ebd} object, this
function will filter both the eBird Basic Dataset and the sampling data using
the same set of filters. This ensures that the files are in sync, i.e. that
they contain data on the same set of checklists.

The AWK script can be saved for future reference by providing an output
filename to \code{awk_file}. The default behavior of this function is to generate
and run the AWK script; however, by setting \code{execute = FALSE} the AWK script
will be generated but not run. In this case, \code{file} is ignored and \code{awk_file}
must be specified.

Calling this function requires that the command line utility AWK is
installed. Linux and Mac machines should have AWK by default; Windows users
will likely need to install \href{https://www.cygwin.com}{Cygwin}.
}
\section{Methods (by class)}{
\itemize{
\item \code{auk_ebd}: \code{auk_ebd} object

\item \code{auk_sampling}: \code{auk_sampling} object
}}

\examples{
# get the path to the example data included in the package
# in practice, provide path to ebd, e.g. f <- "data/ebd_relFeb-2018.txt"
f <- system.file("extdata/ebd-sample.txt", package = "auk")
# define filters
filters <- auk_ebd(f) \%>\%
  auk_species(species = c("Canada Jay", "Blue Jay")) \%>\%
  auk_country(country = c("US", "Canada")) \%>\%
  auk_bbox(bbox = c(-100, 37, -80, 52)) \%>\%
  auk_date(date = c("2012-01-01", "2012-12-31")) \%>\%
  auk_time(start_time = c("06:00", "09:00")) \%>\%
  auk_duration(duration = c(0, 60)) \%>\%
  auk_complete()
  
# alternatively, without pipes
ebd <- auk_ebd(system.file("extdata/ebd-sample.txt", package = "auk"))
filters <- auk_species(ebd, species = c("Canada Jay", "Blue Jay"))
filters <- auk_country(filters, country = c("US", "Canada"))
filters <- auk_bbox(filters, bbox = c(-100, 37, -80, 52))
filters <- auk_date(filters, date = c("2012-01-01", "2012-12-31"))
filters <- auk_time(filters, start_time = c("06:00", "09:00"))
filters <- auk_duration(filters, duration = c(0, 60))
filters <- auk_complete(filters)

# apply filters
\dontrun{
# output to a temp file for example
# in practice, provide path to output file
# e.g. f_out <- "output/ebd_filtered.txt"
f_out <- tempfile()
filtered <- auk_filter(filters, file = f_out)
str(read_ebd(filtered))
}
}
\seealso{
Other filter: \code{\link{auk_bbox}},
  \code{\link{auk_bcr}}, \code{\link{auk_breeding}},
  \code{\link{auk_complete}}, \code{\link{auk_country}},
  \code{\link{auk_date}}, \code{\link{auk_distance}},
  \code{\link{auk_duration}}, \code{\link{auk_extent}},
  \code{\link{auk_last_edited}}, \code{\link{auk_project}},
  \code{\link{auk_protocol}}, \code{\link{auk_species}},
  \code{\link{auk_state}}, \code{\link{auk_time}}
}
\concept{filter}
