
##
## data interfaces to cSPADE
##
## ceeboo 2007, 2008

.as_integer <- function(x) {
    ## preserve factor
    if (typeof(x) != "integer") {
        ## must be atomic
        x <- factor(x)
        l <- suppressWarnings(as.integer(levels(x)))
        ## implicit coercion
        if (!any(is.na(l)) && all(l == levels(x)))
            x <- l[c(x)]
    }
    x
}

read_baskets <- function(con, sep = "[ \t]+", info = NULL, iteminfo = NULL) {
    x <- readLines(con)
    x <- sub("^[ \t]+", "", x)
    x <- strsplit(x, split = sep)
    if (!is.null(info)) {
        i <- info
        info <- lapply(seq(length(info)), function(k) sapply(x, "[", k))
        names(info) <- i
        x <- lapply(x, "[", -seq(length(info)))
        # fixme: warning
        x <- lapply(x, unique)
    }
    x <- as(x, "transactions")
    if (!is.null(info)) {
        if (!is.null(info$sequenceID))
            info$sequenceID <- .as_integer(info$sequenceID)
        if (!is.null(info$eventID))
            info$eventID <- .as_integer(info$eventID)
        if (is.factor(info$eventID))
            warning("'eventID' is a factor")
        transactionInfo(x) <- data.frame(info)
    }
    if (!is.null(iteminfo)) {
        if (!is.data.frame(iteminfo))
            stop("'iteminfo' not a data frame")
        labels <- itemLabels(x)
        if (!all(labels %in% rownames(iteminfo)))
            stop("the row names of 'iteminfo' do not match the item labels")
        iteminfo <- iteminfo[labels,, drop = FALSE]
        if ("labels" %in% names(iteminfo))
            iteminfo$labels <- as.character(iteminfo$labels)
        else
            iteminfo <- cbind(x@itemInfo, iteminfo)
        itemInfo(x) <- iteminfo
    }
    x
}

## currently internal only

read_spade <- function(con = "", decode = FALSE, labels = NULL) {
    if (con == "")
        con <- stdin()
    else 
    if (is.character(con)) {
        con <- file(con, "r")
        on.exit(close(con))
    }
    if (!inherits(con, "connection")) 
        stop("'con' must be a character string or connection.")

    n <- readLines(con, 1)
    if (!length(n))
        stop("the number of lines is zero")
    n <- as.integer(strsplit(n, " ")[[1]][5])
        
    x <- readLines(con)
    if (!length(x))
        return(new("sequences", info = list(nsequences = n)))

    x <- strsplit(x, split = " -- ")
   
    # FIXME there are 2 counts

    c <- sapply(x, "[", 2)
    c <- as.integer(sapply(strsplit(c, split = " "), "[", 1))
   
    # split into a list of lists (sequences) each 
    # containing a vector of character (itemsets)
   
    x <- lapply(strsplit(sapply(x, "[", 1), split = " -> "), strsplit, " ")
    if (decode)
        x <- lapply(x, lapply, as.integer)

    if (!length(x))
        stop("the number of sequences parsed is zero")

    x <- as(x, "sequences")
    x@quality <- data.frame(support = c / n)
    x@info <- list(nsequences = n)

    k <- which(size(x) == 1)
    if (length(k) == length(x@elements)) {
        i <- x@data[,k]@i + 1L
        k[i] <- k
        quality(x@elements) <- x@quality[k,, drop = FALSE]
    } else
        stop("the data is incomplete")

    if (!is.null(labels)) {
        k <- as.integer(as.character(x@elements@items@itemInfo$labels))
        itemLabels(x@elements@items) <- as.character(labels[k])
    }
    validObject(x)
    x
}

## write data in text format for later
## processing by exec/makebin

write_cspade <- function(x, con) {
    if (!inherits(x, "transactions"))
        stop("'x' not of class transactions")

    r <- .Call("R_asList_ngCMatrix", x@data, NULL)
    r <- sapply(r, paste, collapse = " ")
    
    sid <- .as_integer(x@transactionInfo$sequenceID)
    if (any(sort(sid) != sid))
        stop("sequenceID not in ascending order")
    eid <- .as_integer(x@transactionInfo$eventID)
    if (is.factor(eid))
        warning("'eventID' is a factor")
    if (any(tapply(eid, sid, function(x) any(sort(x) != x))))
        stop("eventID not in blockwise ascending order")

    r <- rbind(as.character(as.integer(sid)),
               as.character(as.integer(eid)),
               as.character(size(x)), r)
    r <- apply(r, 2, paste, collapse = " ")
    
    writeLines(r, con)
}

## write data directly in binary format for
## later processing by exec/exttpose

makebin <- function(x, file) {
    if (!inherits(x, "transactions"))
        stop("'x' not of class transactions")

    sid <- .as_integer(x@transactionInfo$sequenceID)
    eid <- .as_integer(x@transactionInfo$eventID)
    if (is.factor(eid))
        warning("'eventID' is a factor")

    x <- as(x, "ngCMatrix")
    attr(x, "sid") <- sid
    attr(x, "eid") <- eid

    .Call("R_makebin", x, file)
}

## cSPADE wrapper
##
## note that we assume 1MB = 2^10 x 2^10 = 4^10 for the 
## computation of the number of database partitions.
##
## FIXME the output redirection in system does not work
##       under Windoze.

cspade <- 
function(data, parameter = NULL, control = NULL, tmpdir = tempdir()) {

    if (!inherits(data, "transactions"))
        stop("'data' not of class transactions")
    if (!all(c("sequenceID", "eventID") %in% names(transactionInfo(data))))
        stop("slot transactionInfo: missing 'sequenceID' or 'eventID'")
    if (!all(dim(data))) 
        return(new("sequences"))
    parameter <- as(parameter, "SPparameter")
    control   <- as(control ,  "SPcontrol")

    if (control@verbose) {
        t1 <- proc.time()
        cat("\nparameter specification:\n")
        cat(.formatSP(parameter), sep = "\n")
        cat("\nalgorithmic control:\n")
        cat(.formatSP(control), sep = "\n")
        cat("\npreprocessing ...")
    }

    exe <- system.file("exec", package = "arulesSequences")

    file <- tempfile(pattern = "cspade", tmpdir)
    on.exit(unlink(paste(file, "*", sep = ".")))

    #out <- paste(file, "asc", sep = ".")
    #write_cspade(data, con = out)

    ## preprocess
    opt <- ""
    nop <- ceiling((dim(data)[1] + 2 * length(data@data@i))
	 * .Machine$sizeof.long / 4^10 / 5)
    if (length(control@memsize)) {
        opt <- paste("-m", control@memsize)
        nop <- ceiling(nop * 32 / control@memsize)
    }
    if (length(control@numpart)) {
        if (control@numpart < nop)
            warning("'numpart' less than recommended")
        nop <- control@numpart
    }
    log <- "summary.out"
    if (#system(paste(file.path(exe, "makebin"), 
        #    out, paste(file, "data", sep = "."))) ||
        #system(paste(file.path(exe, "getconf"), 
        #    "-i", file, "-o", file, ">>", log))   ||
        !makebin(data, file) ||
        system(paste(file.path(exe, "exttpose"),
            "-i", file, "-o", file, "-p", nop, opt, "-l -x -s",
            parameter@support, ">>", log))
       ) stop("system invocation failed")

    ## options
    if (length(parameter@maxsize))
        opt <- paste(opt, "-Z", parameter@maxsize, collapse = "")
    if (length(parameter@maxlen))
        opt <- paste(opt, "-z", parameter@maxlen,  collapse = "")
    if (length(parameter@mingap))
        opt <- paste(opt, "-l", parameter@mingap,  collapse = "")
    if (length(parameter@maxgap))
        opt <- paste(opt, "-u", parameter@maxgap,  collapse = "")
    if (length(parameter@maxwin))
        opt <- paste(opt, "-w", parameter@maxwin,  collapse = "")

    if (!length(control@bfstype) || !control@bfstype)
        opt <- paste(opt, "-r", collapse = "")

    if (control@verbose) {
        t2 <- proc.time()
        du <- sum(file.info(list.files(path = dirname(file),
            pattern = basename(file), full.names = TRUE))$size)
        cat(paste("", nop, "partition(s),",
            round(du / 4^10, digits = 2), "MB"))
        cat(paste(" [",format((t2-t1)[3], digits =2, format = "f"),
                  "s]", sep = ""))
        cat("\nmining transactions ...")
    }

    out <- paste(file, "out", sep = ".")
    if (system(paste(file.path(exe, "spade"),
            "-i", file, "-s", parameter@support, opt, "-e", nop, "-o >", out))
       ) stop("system invocation failed")

    if (control@verbose) {
        t3 <- proc.time()
        du <- file.info(out)$size
        cat(paste("", round(du / 4^10, digits = 2), "MB"))
        cat(paste(" [",format((t3-t2)[3], digits =2, format = "f"),
                  "s]", sep = ""))
        cat("\nreading sequences ...")
    }

    out <- read_spade(con = out, labels = itemLabels(data))

    out@info <- c(
        data = match.call()$data,
        ntransactions = length(data),
        out@info,
        support = parameter@support
    )

    if (control@verbose) {
        t4 <- proc.time()
        cat(paste(" [",format((t4-t3)[3], digits =2, format = "f"),
                  "s]", sep = ""))
        cat("\n\ntotal elapsed time: ", (t4-t1)[3], "s\n", sep ="")
    }
    if (!control@summary)
        unlink("summary.out")

    out
}

###
