% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/par-01-parSeqSim.R
\name{crossSetSim}
\alias{crossSetSim}
\title{Parallel Protein Sequence Similarity Calculation Between Two Sets
Based on Sequence Alignment (In-Memory Version)}
\usage{
crossSetSim(
  protlist1,
  protlist2,
  type = "local",
  cores = 2,
  batches = 1,
  verbose = FALSE,
  submat = "BLOSUM62",
  gap.opening = 10,
  gap.extension = 4
)
}
\arguments{
\item{protlist1}{A length \code{n} list containing \code{n} protein sequences,
each component of the list is a character string, storing one protein
sequence. Unknown sequences should be represented as \code{""}.}

\item{protlist2}{A length \code{m} list containing \code{m} protein sequences,
each component of the list is a character string, storing one protein
sequence. Unknown sequences should be represented as \code{""}.}

\item{type}{Type of alignment, default is \code{"local"},
  can be \code{"global"} or \code{"local"},
  where \code{"global"} represents Needleman-Wunsch global alignment;
\code{"local"} represents Smith-Waterman local alignment.}

\item{cores}{Integer. The number of CPU cores to use for parallel execution,
default is \code{2}. Use the \code{availableCores()} function from the
parallelly package to check how many cores are available.}

\item{batches}{Integer. The number of batches to split the
similarity computations into. This is useful when you have a large
number of protein sequences and enough CPU cores, but not
enough RAM to compute and hold all the similarities
in a single batch. Defaults to \code{1}.}

\item{verbose}{Logical. Should the computation progress be printed?
Useful when \code{batches > 1}. Defaults to \code{FALSE}.}

\item{submat}{Substitution matrix, default is \code{"BLOSUM62"},
can be one of \code{"BLOSUM45"}, \code{"BLOSUM50"}, \code{"BLOSUM62"},
\code{"BLOSUM80"}, \code{"BLOSUM100"}, \code{"PAM30"},
\code{"PAM40"}, \code{"PAM70"}, \code{"PAM120"}, or \code{"PAM250"}.}

\item{gap.opening}{The cost required to open a gap of any length
in the alignment. Defaults to 10.}

\item{gap.extension}{The cost to extend the length of an existing
gap by 1. Defaults to 4.}
}
\value{
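An \code{m} x \code{n} similarity matrix, with one row per sequence in
\code{protlist2} and one column per sequence in \code{protlist1}
(see the example below).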
}
\description{
Parallel calculation of protein sequence similarity based on
sequence alignment between two sets of protein sequences.
}
\examples{
\dontrun{

# Be careful when testing this since it involves parallelization
# and might produce unpredictable results in some environments

library("Biostrings")
library("foreach")
library("doParallel")

s1 <- readFASTA(system.file("protseq/P00750.fasta", package = "protr"))[[1]]
s2 <- readFASTA(system.file("protseq/P08218.fasta", package = "protr"))[[1]]
s3 <- readFASTA(system.file("protseq/P10323.fasta", package = "protr"))[[1]]
s4 <- readFASTA(system.file("protseq/P20160.fasta", package = "protr"))[[1]]
s5 <- readFASTA(system.file("protseq/Q9NZP8.fasta", package = "protr"))[[1]]

plist1 <- list(s1 = s1, s2 = s2, s4 = s4)
plist2 <- list(s3 = s3, s4_again = s4, s5 = s5, s1_again = s1)
psimmat <- crossSetSim(plist1, plist2)
colnames(psimmat) <- names(plist1)
rownames(psimmat) <- names(plist2)
print(psimmat)
#                 s1         s2         s4
# s3       0.10236985 0.18858241 0.05819984
# s4_again 0.04921696 0.12124217 1.00000000
# s5       0.03943488 0.06391103 0.05714638
# s1_again 1.00000000 0.11825938 0.04921696
}
}
\author{
Sebastian Mueller <\url{https://alva-genomics.com}>
}
