% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/qc_filtering.R
\name{qc.filtering}
\alias{qc.filtering}
\title{Quality control filtering of molecular matrix M for downstream analyses}
\usage{
qc.filtering(
  M = NULL,
  base = FALSE,
  na.string = NA,
  map = NULL,
  marker = NULL,
  chrom = NULL,
  pos = NULL,
  ref = NULL,
  marker.callrate = 1,
  ind.callrate = 1,
  maf = 0,
  heterozygosity = 1,
  Fis = 1,
  impute = FALSE,
  Mrecode = FALSE,
  plots = TRUE,
  digits = 2,
  message = TRUE
)
}
\arguments{
\item{M}{A matrix with SNP data of full form (\eqn{n \times p}), with \eqn{n} individuals and \eqn{p} markers.
Individual and marker names are assigned to \code{rownames} and \code{colnames}, respectively.
Data in matrix is coded as 0, 1, 2 (integer or numeric) (default = \code{NULL}).}

\item{base}{If \code{TRUE} matrix \eqn{\boldsymbol{M}} is considered as bi-allelic SNP data format (character)
and the SNPs are recoded to numerical values before performing the quality control filters
(default = \code{FALSE}) (currently deprecated).}

\item{na.string}{A character that will be interpreted as \code{NA} values (default = \code{NA}).}

\item{map}{(Optional) A data frame with the map information with \eqn{p} rows (default = \code{NULL}).}

\item{marker}{A character indicating the name of the column in data frame \code{map} with the identification
of markers. This is mandatory if \code{map} is provided (default = \code{NULL}).}

\item{chrom}{A character indicating the name of the column in data frame \code{map} with the identification
of chromosomes (default = \code{NULL}).}

\item{pos}{A character indicating the name of the column in data frame \code{map} with the identification
of marker positions (default = \code{NULL}).}

\item{ref}{A character indicating the name of the column in the map containing the reference allele for
recoding. If absent, then conversion will be based on the major allele (most frequent).
The marker information of a given individual with two of the specified major alleles
in \code{ref} will be coded as 2 (default = \code{NULL}).}

\item{marker.callrate}{A numerical value between 0 and 1 used to remove SNPs with a rate
of missing values equal to or larger than this value (default = 1, \emph{i.e.} no removing).}

\item{ind.callrate}{A numerical value between 0 and 1 used to remove individuals with a
rate of missing values equal to or larger than this value (default = 1, \emph{i.e.} no removing).}

\item{maf}{A numerical value between 0 and 1 used to remove SNPs with a Minor Allele Frequency
(MAF) below this value (default = 0, \emph{i.e.} no removing).}

\item{heterozygosity}{A numeric value indicating the maximum value of accepted observed heterozygosity (Ho)
(default = 1, \emph{i.e.} no removing).}

\item{Fis}{A numeric value indicating the maximum value of accepted inbreeding (Fis) following
the equation \eqn{|1 - (Ho/He)|} (default = 1, \emph{i.e.} no removing).}

\item{impute}{If \code{TRUE} imputation of missing values is done using the mean of each SNP
(default = \code{FALSE}).}

\item{Mrecode}{If \code{TRUE} it provides the recoded \eqn{\boldsymbol{M}} matrix from the bi-allelic to numeric SNP
(default = \code{FALSE}) (currently deprecated).}

\item{plots}{If \code{TRUE} generates graphical output of the quality control based on the
original input matrix (default = \code{TRUE}).}

\item{digits}{Set up the number of digits used to round the output matrix (default = 2).}

\item{message}{If \code{TRUE} diagnostic messages are printed on screen (default = \code{TRUE}).}
}
\value{
A list with the following elements:
\itemize{
\item{\code{M.clean}: the cleaned \eqn{\boldsymbol{M}} matrix after the quality control filters have been applied.}
\item{\code{map}: if provided, a cleaned \code{map} data frame after the quality control filters have been applied.}
\item{\code{plot.missing.ind}: a plot of missing data per individual (original marker matrix).}
\item{\code{plot.missing.SNP}: a plot of missing data per SNP (original marker matrix).}
\item{\code{plot.heteroz}: a plot of observed heterozygosity per SNP (original marker matrix).}
\item{\code{plot.Fis}: a plot of Fis per SNP (original marker matrix).}
\item{\code{plot.maf}: a plot of the minor allele frequency (original marker matrix).}
}
}
\description{
Reads molecular data in the format 0, 1, 2 and performs some basic quality control
filters and simple imputation.
Matrix provided is of the full form (\eqn{n \times p}), with \eqn{n} individuals and \eqn{p} markers.
Individual and marker names are assigned to \code{rownames} and \code{colnames},
respectively. Filtering can be done with some of the following options by
specifying thresholds for:
missing values on individuals, missing values on markers, minor allele frequency,
inbreeding Fis value (of markers), and observed heterozygosity (of markers).
String used for identifying missing values can be specified.
If requested, missing values will be imputed based on the mean of each SNP.
}
\details{
\strong{Warning}: The arguments \code{base}, \code{ref}, and \code{Mrecode}
are currently deprecated and will
be removed in the next version of \code{ASRgenomics}.
Use function \code{\link{snp.recode}} to recode the matrix prior to using \code{qc.filtering}.

The filtering process is carried out as expressed in the following simplified pseudo-code
that consists of a loop repeated twice:

\strong{for i in 1 to 2}

    Filter markers based on call rate.

    Filter individuals based on call rate.

    Filter markers based on minor allele frequency.

    Filter markers based on observed heterozygosity.

    Filter markers based on inbreeding.

\strong{end for}
}
\examples{
# Example: Pine dataset from ASRgenomics (coded as 0,1,2 with missing as -9).

M.clean <- qc.filtering(
 M = geno.pine926,
 maf = 0.05,
 marker.callrate = 0.9, ind.callrate = 0.9,
 heterozygosity = 0.9, Fis = 0.6,
 na.string = "-9")
ls(M.clean)
M.clean$M.clean[1:5, 1:5]
dim(M.clean$M.clean)
head(M.clean$map)
M.clean$plot.maf
M.clean$plot.missing.ind
M.clean$plot.missing.SNP
M.clean$plot.heteroz
M.clean$plot.Fis

\donttest{
# Example: Salmon dataset (coded as 0,1,2 with missing as NA).

M.clean <- qc.filtering(
 M = geno.salmon,
 maf = 0.02,
 marker.callrate = 0.10, ind.callrate = 0.20,
 heterozygosity = 0.9, Fis = 0.4)
M.clean$M.clean[1:5, 1:5]
dim(M.clean$M.clean)
head(M.clean$map)
M.clean$plot.maf
M.clean$plot.missing.ind
M.clean$plot.missing.SNP
M.clean$plot.heteroz
M.clean$plot.Fis
}

}
