% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/read-html.R
\name{ragnar_read_document}
\alias{ragnar_read_document}
\title{Read an HTML document}
\usage{
ragnar_read_document(
  x,
  ...,
  split_by_tags = frame_by_tags,
  frame_by_tags = NULL
)
}
\arguments{
\item{x}{File path or URL, passed on to \code{rvest::read_html()}, or an \code{xml_node}.}

\item{...}{passed on to \code{rvest::read_html()}}

\item{split_by_tags}{character vector of HTML tag names used to split the
returned text. Defaults to \code{frame_by_tags}.}

\item{frame_by_tags}{character vector of HTML tag names used to create a
data frame of the returned content}
}
\value{
If \code{frame_by_tags} is not \code{NULL}, then a data frame is returned,
with one column for each tag named in \code{frame_by_tags}, plus a \code{text} column.

If \code{frame_by_tags} is \code{NULL} but \code{split_by_tags} is not \code{NULL}, then a named
character vector is returned.

If both \code{frame_by_tags} and \code{split_by_tags} are \code{NULL}, then a string
(length-1 character vector) is returned.
}
\description{
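Read an HTML document and return its text, optionally split into chunks or
arranged into a data frame according to HTML tag names.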
}
\examples{
# download a sample HTML page to a temporary file used by all examples below
file <- tempfile(fileext = ".html")
download.file("https://r4ds.hadley.nz/base-R.html", file, quiet = TRUE)

# with neither `split_by_tags` nor `frame_by_tags` supplied,
# the text is returned as a single string.
file |> ragnar_read_document() |> str()

# use `split_by_tags` to get a named character vector of length > 1
file |>
  ragnar_read_document(split_by_tags = c("h1", "h2", "h3")) |>
  tibble::enframe("tag", "text")

# use `frame_by_tags` to get a data frame where the
# headings associated with each text chunk are easily accessible
file |>
  ragnar_read_document(frame_by_tags = c("h1", "h2", "h3"))

# use `split_by_tags` and `frame_by_tags` together to further break up `text`.
file |>
  ragnar_read_document(
    split_by_tags = c("p"),
    frame_by_tags = c("h1", "h2", "h3")
  )

# Example workflow adding context to each chunk:
# each row is formatted with its chapter and section headings above the text
file |>
  ragnar_read_document(frame_by_tags = c("h1", "h2", "h3")) |>
  glue::glue_data(r"--(
    ## Excerpt from the book "R for Data Science (2e)"
    chapter: {h1}
    section: {h2}
    content: {text}

    )--") |>
    # inspect chunks 6 and 7, printed with a visual separator between them
    _[6:7] |> cat(sep = "\n~~~~~~~~~~~\n")

# Advanced example of postprocessing the output of ragnar_read_document()
# to wrap code blocks in backticks, markdown style
library(dplyr, warn.conflicts = FALSE)
library(stringr)
library(rvest)
library(xml2)
file |>
  ragnar_read_document(frame_by_tags = c("h1", "h2", "h3"),
                       split_by_tags = c("p", "pre")) |>
  # flag rows whose `tag` is "pre" and fence their text with triple backticks
  mutate(
    is_code = tag == "pre",
    text = ifelse(is_code,
                  str_c("```", text, "```", sep = "\n"),
                  text)) |>
  # collapse the chunks back into one row per heading combination
  group_by(h1, h2, h3) |>
  summarise(text = str_flatten(text, "\n"), .groups = "drop") |>
  glue::glue_data(r"--(
    # Excerpt from the book "R for Data Science (2e)"
    chapter: {h1}
    section: {h2}
    content: {text}

    )--") |>
    # inspect chunks 9 and 10, printed with a visual separator between them
    _[9:10] |> cat(sep = "\n~~~~~~~~~~~\n")

# Example of preprocessing the input to ragnar_read_document()
# to wrap code in backticks, markdown style
# same outcome as above, except via preprocessing instead of postprocessing.
file |>
  read_html() |>
  (\(doc) {
    # fence preformatted code with triple backticks
    for (node in html_elements(doc, "pre")) {
      # `.where = 0` inserts the opening fence as the first child;
      # the closing fence uses the default position and is appended last
      xml_add_child(node, "code", "```\n", .where = 0)
      xml_add_child(node, "code", "\n```")
    }
    # wrap inline code with single backticks
    for (node in html_elements(doc, "code")) {
      # skip code nested inside a <pre> element, which was already fenced above
      if (!"pre" \%in\% xml_name(xml_parents(node))) {
        xml_text(node) <- str_c("`", xml_text(node), "`")
      }
    }
    # return the modified document so it can be piped onward
    doc
  })() |>
  ragnar_read_document(frame_by_tags = c("h1", "h2", "h3")) |>
  glue::glue_data(r"--(
    # Excerpt from the book "R for Data Science (2e)"
    chapter: {h1}
    section: {h2}
    content: {text}

    )--") |> _[6]
}
