% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/read_pdf.R
\name{read_pdf}
\alias{read_pdf}
\title{Read a Portable Document Format (PDF) File into R}
\usage{
read_pdf(file, skip = 0)
}
\arguments{
\item{file}{A path to a PDF file.}

\item{skip}{Integer; the number of lines of the data file to skip before
beginning to read data.}
}
\value{
Returns a \code{\link[base]{data.frame}} with the page number
(\code{page_id}), line number (\code{element_id}), and the \code{text}.
}
\description{
A wrapper for \code{\link[pdftools]{pdf_text}} to read PDFs into R.
}
\note{
A word of caution from \href{http://stackoverflow.com/a/9187015/1000343}{Carl Witthoft}:
"Just a warning to others who may be hoping to extract data: PDF is a
container, not a format. If the original document does not contain actual
text, as opposed to bitmapped images of text or possibly even uglier things
than I can imagine, nothing other than OCR can help you."
}
\examples{
pdf_dat <- read_pdf(
    system.file("docs/rl10075oralhistoryst002.pdf", package = "textreadr")
)

pdf_dat_b <- read_pdf(
    system.file("docs/rl10075oralhistoryst002.pdf", package = "textreadr"),
    skip = 1
)

\dontrun{
library(textshape)
system.file("docs/rl10075oralhistoryst002.pdf", package = "textreadr") \%>\%
    read_pdf(1) \%>\%
    `[[`('text') \%>\%
    head(-1) \%>\%
    textshape::combine() \%>\%
    gsub("([A-Z])( )([A-Z])", "\\\\1_\\\\3", .) \%>\%
    strsplit("(-| )(?=[A-Z_]+:)", perl=TRUE) \%>\%
    `[[`(1) \%>\%
    textshape::split_transcript()
}
}
\seealso{
\code{\link[tm]{readPDF}}
}
\keyword{pdf}

