% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/chunks.R
\name{pub_chunks}
\alias{pub_chunks}
\title{Extract chunks of data from articles}
\usage{
pub_chunks(x, sections = "all", provider = NULL)
}
\arguments{
\item{x}{one of the following:
\itemize{
\item a file path for an XML file
\item a character string of XML
\item an \code{xml_document} object (e.g., from \code{xml2::read_xml()})
\item a list of file paths, character strings of XML, or
\code{xml_document} objects
\item an object of class \code{fulltext::ft_data}, the output from a call to
\code{fulltext::ft_get()}
}}

\item{sections}{(character) Optional. Which sections to get; can be one or
more, in a vector or list. See \code{\link[=pub_sections]{pub_sections()}} for options.
Default: \code{"all"}, which gets all sections. See Details.}

\item{provider}{(character) a single publisher name. See
\code{\link[=pub_providers]{pub_providers()}} for options. Optional. By default
this is \code{NULL} and we use \code{\link[=pub_guess_publisher]{pub_guess_publisher()}} to guess the
publisher; we may get it wrong. You can override our guessing by passing
in a name. If you select the wrong provider for the XML you have, you may
or may not get what you need.}
}
\value{
A list, named by the section selected. Sections not found or
not in the accepted list return \code{NULL} or a zero-length list. A ".publisher"
list element is attached to each list output, even when no
data is found. When \code{fulltext::ft_get()} output is passed in here, the
list is named by the publisher, and within each publisher is a list
of articles named by their identifiers (e.g., DOIs).
}
\description{
\code{pub_chunks} makes it easy to extract sections of an article.
You can extract just authors across all articles, or all references
sections, or the complete text of each article. Then you can pass the
output downstream for visualization and analysis.
}
\details{
Options for the \code{sections} parameter:
\itemize{
\item front - Publisher, journal and article metadata elements
\item body - Body of the article
\item back - Back of the article: acknowledgments, author contributions,
references
\item title - Article title
\item doi - Article DOI
\item categories - Publisher's categories, if any
\item authors - Authors
\item aff - Affiliation (includes author names)
\item keywords - Keywords
\item abstract - Article abstract
\item executive_summary - Article executive summary
\item refs - References
\item refs_dois - Reference DOIs, if available
\item publisher - Publisher name
\item journal_meta - Journal metadata
\item article_meta - Article metadata
\item acknowledgments - Acknowledgments
\item permissions - Article permissions
\item history - Dates: received, published, accepted, etc.
}
}
\examples{
# a file path to an XML file
x <- system.file("examples/elsevier_1.xml", package = "pubchunks")
pub_chunks(x, "title")
pub_chunks(x, "authors")
pub_chunks(x, "acknowledgments")
pub_chunks(x, "refs")
pub_chunks(x, c("title", "refs"))

# works the same with the xml already in a string
xml <- paste0(readLines(x), collapse = "")
pub_chunks(xml, "title")

# also works if you've already read in the XML (with xml2 pkg)
xml <- paste0(readLines(x), collapse = "")
xml <- xml2::read_xml(xml)
pub_chunks(xml, "title")

# Hindawi
x <- system.file("examples/hindawi_1.xml", package = "pubchunks")
pub_chunks(x, "abstract")
pub_chunks(x, "authors")
pub_chunks(x, "aff")
pub_chunks(x, "title")
pub_chunks(x, c("abstract", "title", "authors", "refs"))

# Pensoft
x <- system.file("examples/pensoft_1.xml", package = "pubchunks")
pub_chunks(x, "abstract")
pub_chunks(x, "aff")
pub_chunks(x, "title")
pub_chunks(x, c("abstract", "title", "authors", "refs"))

# PeerJ
x <- system.file("examples/peerj_1.xml", package = "pubchunks")
pub_chunks(x, "abstract")
pub_chunks(x, "authors")
pub_chunks(x, "aff")
pub_chunks(x, "title")
pub_chunks(x, c("abstract", "title", "authors", "refs"))

# Copernicus
x <- system.file("examples/copernicus_1.xml", package = "pubchunks")
pub_chunks(x, c("doi", "abstract", "title", "authors", "refs"))
pub_chunks(x, "aff")

# Frontiers
x <- system.file("examples/frontiers_1.xml", package = "pubchunks")
pub_chunks(x, "authors")
pub_chunks(x, "aff")
pub_chunks(x, c("doi", "abstract", "title", "authors", "refs"))

# eLife
x <- system.file("examples/elife_1.xml", package = "pubchunks")
pub_chunks(x, "authors")
pub_chunks(x, "aff")
pub_chunks(x, c("doi", "title", "authors", "refs"))

# F1000Research
x <- system.file("examples/f1000research_1.xml", package = "pubchunks")
pub_chunks(x, "title")
pub_chunks(x, "aff")
pub_chunks(x, c("doi", "title", "authors", "keywords", "refs"))

# Many inputs at once
x <- system.file("examples/frontiers_1.xml", package = "pubchunks")
y <- system.file("examples/elife_1.xml", package = "pubchunks")
z <- system.file("examples/f1000research_1.xml", package = "pubchunks")
pub_chunks(list(x, y, z), c("doi", "title", "authors", "refs"))

\dontrun{
# using output of fulltext::ft_get()
if (requireNamespace("fulltext", quietly = TRUE)) {
  # single
  x <- fulltext::ft_get('10.7554/eLife.03032')
  pub_chunks(fulltext::ft_collect(x), sections="authors")

  # many
  dois <- c('10.1371/journal.pone.0086169', '10.1371/journal.pone.0155491', 
    '10.7554/eLife.03032')
  x <- fulltext::ft_get(dois)
  pub_chunks(fulltext::ft_collect(x), sections="authors")
}
}
}
