% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DSD_ReadStream.R
\name{DSD_ReadStream}
\alias{DSD_ReadStream}
\alias{DSD_ReadCSV}
\alias{close_stream.DSD_ReadStream}
\alias{close_stream.DSD_ReadCSV}
\title{Read a Data Stream from a File or a Connection}
\usage{
DSD_ReadStream(
  file,
  k = NA,
  take = NULL,
  sep = ",",
  header = FALSE,
  skip = 0,
  col.names = NULL,
  colClasses = NA,
  outofpoints = c("warn", "ignore", "stop"),
  ...
)

DSD_ReadCSV(
  file,
  k = NA,
  take = NULL,
  sep = ",",
  header = FALSE,
  skip = 0,
  col.names = NULL,
  colClasses = NA,
  outofpoints = c("warn", "ignore", "stop"),
  ...
)

\method{close_stream}{DSD_ReadStream}(dsd, ...)

\method{close_stream}{DSD_ReadCSV}(dsd, ...)
}
\arguments{
\item{file}{A file/URL or an open connection.}

\item{k}{Number of true clusters, if known.}

\item{take}{indices of columns to extract from the file.}

\item{sep}{The character string that separates dimensions in data points in
the stream.}

\item{header}{Does the first line contain variable names?}

\item{skip}{the number of lines of the data file to skip before beginning to
read data.}

\item{col.names}{A vector of optional names for the variables. The default is to use \code{"V"} followed by the
column number. Additional information columns (e.g., class labels) need to have names starting with \code{.}.}

\item{colClasses}{A vector of classes to be assumed for the columns passed
on to \code{\link[=read.table]{read.table()}}.}

\item{outofpoints}{Action taken if fewer than \code{n} data points are
available. The default is to return the available data points with a warning. The supported actions are:
\itemize{
\item \code{warn}: return the available points (maybe an empty data.frame) with a warning.
\item \code{ignore}: silently return the available points.
\item \code{stop}: stop with an error.
}}

\item{...}{Further arguments are passed on to \code{\link[=read.table]{read.table()}}.  This can
for example be used for encoding, quotes, etc.}

\item{dsd}{An object of class \code{DSD_ReadCSV}.}
}
\value{
An object of class \code{DSD_ReadCSV} (subclass of \link{DSD_R}, \link{DSD}).
}
\description{
A DSD class that reads a data stream (text format) from a file or any R connection.
}
\details{
\code{DSD_ReadStream} uses \code{\link[=readLines]{readLines()}} and \code{\link[=read.table]{read.table()}} to read data from an R
connection line-by-line and convert it into a data.frame.
The connection is responsible for maintaining where the stream
is currently being read from. Typically, the connection is a file stored
on disk, but many other types of connections are possible (see
\link{connection}).

The implementation tries to gracefully deal with slightly corrupted data by
dropping points with inconsistent reading and producing a warning. However,
this might not always be possible, resulting in an error instead.

\strong{Column names}

If the file has column headers in the first line, then they can be used by setting \code{header = TRUE}.
Alternatively, column names can be set using \code{col.names} or a named vector for \code{take}. If no column
names are specified then default names will be created.

Columns with names that start with \code{.} are considered information columns and are ignored by \code{DST}s.
See \code{\link[=get_points]{get_points()}} for details.

Other information columns are used by various functions.

\strong{Reading the whole stream}

By using \code{n = -1} in \code{get_points()}, the whole stream is returned.

\strong{Resetting and closing a stream}

The position in the file can be reset to the beginning or another position using
\code{\link[=reset_stream]{reset_stream()}}. This fails if the underlying connection is not seekable (see \link{connection}).

\code{DSD_ReadStream} maintains an open connection to the stream and needs to be closed
using \code{\link[=close_stream]{close_stream()}}.

\code{DSD_ReadCSV} reads a stream from a comma-separated values file.
}
\examples{
# Example 1: creating data and writing it to disk
stream <- DSD_Gaussians(k = 3, d = 2)
write_stream(stream, "data.txt", n = 100, info = TRUE, header = TRUE)
readLines("data.txt", n = 5)

# reading the same data back
stream2 <- DSD_ReadStream("data.txt", header = TRUE)
stream2

# get points
get_points(stream2, n = 5)
plot(stream2, n = 20)

# clean up
close_stream(stream2)
file.remove("data.txt")

# Example 2:  Read part of the kddcup1999 data (take only cont. variables)
# col 42 is the class variable
file <- system.file("examples", "kddcup10000.data.gz", package = "stream")
stream <- DSD_ReadCSV(gzfile(file),
        take = c(1, 5, 6, 8:11, 13:20, 23:41, .class = 42), k = 7)
stream

get_points(stream, 5)

# plot 100 points (projected on the first two principal components)
plot(stream, n = 100, method = "pca")

close_stream(stream)
}
\seealso{
\code{\link[=readLines]{readLines()}}, \code{\link[=read.table]{read.table()}}.

Other DSD: 
\code{\link{DSD}()},
\code{\link{DSD_BarsAndGaussians}()},
\code{\link{DSD_Benchmark}()},
\code{\link{DSD_Cubes}()},
\code{\link{DSD_Gaussians}()},
\code{\link{DSD_MG}()},
\code{\link{DSD_Memory}()},
\code{\link{DSD_Mixture}()},
\code{\link{DSD_NULL}()},
\code{\link{DSD_ReadDB}()},
\code{\link{DSD_Target}()},
\code{\link{DSD_UniformNoise}()},
\code{\link{DSD_mlbenchData}()},
\code{\link{DSD_mlbenchGenerator}()},
\code{\link{DSF}()},
\code{\link{animate_data}()},
\code{\link{close_stream}()},
\code{\link{get_points}()},
\code{\link{plot.DSD}()},
\code{\link{reset_stream}()}
}
\author{
Michael Hahsler
}
\concept{DSD}
