% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/PredictLogReg.R
\name{PredictLogReg}
\alias{PredictLogReg}
\title{Prediction of the category to which a mass spectrum belongs}
\usage{
PredictLogReg(
  peaks,
  model,
  moz,
  tolerance = 6,
  toleranceStep = 2,
  normalizeFun = TRUE,
  noMatch = 0,
  Reference = NULL,
  chunk_size = 10000L,
  ncores = 1L,
  verbose = FALSE
)
}
\arguments{
\item{peaks}{a list of MALDIquant::MassPeaks objects (one per spectrum).}

\item{model}{a model or a list of models estimated from a shortlist of m/z
(e.g., the \code{train_mod} element of the output of \code{LogReg}). Each model
must support \code{predict(..., type = "prob")}. If a single model is supplied,
it is wrapped into a one-element list.}

\item{moz}{a numeric vector of shortlisted m/z values used for prediction
(typically the selection from SelectionVar/SelectionVarStat_fast).}

\item{tolerance}{numeric; accepted m/z tolerance (in Da) for matching peaks to
moz. Default 6.}

\item{toleranceStep}{numeric; if a spectrum yields no matches at the initial
tolerance, the function retries by increasing tolerance in steps of
\code{toleranceStep} until at least one match is found (bounded internally). Default 2.}

\item{normalizeFun}{logical; if TRUE (default), per-spectrum max normalization
is applied after matching (each row is divided by its maximum).}

\item{noMatch}{numeric; intensity used when no peak matches a given m/z. Default 0.}

\item{Reference}{optional factor of true categories, with length equal to
\code{length(peaks)}. If it is provided and has at least two distinct levels, the
function returns, in addition to the predictions, per-model confusion matrices
(\code{caret::confusionMatrix}).}

\item{chunk_size}{integer; number of spectra per prediction batch (rows of X).
Large datasets can be processed in chunks to limit memory. Default 10000.}

\item{ncores}{integer; number of cores used while building X from peaks on the
R side (the C++ matching itself is single-threaded here). Default 1.}

\item{verbose}{logical; print progress messages. Default FALSE.}
}
\value{
If \code{Reference} is \code{NULL} (or has fewer than two levels), a data.frame with:
\itemize{
\item name: spectrum name (from MassPeaks metaData fullName/file when available)
\item method: model identifier (from model$method; suffixed with "_i" if needed)
\item one column per class with predicted probabilities
\item pred_max_p: predicted class (argmax of probabilities)
}

If Reference is provided with at least two levels, a list with:
\itemize{
\item Prob.results: the predictions data.frame as above
\item Confusion.Matrix: a list of caret::confusionMatrix objects (one per method)
}
}
\description{
Predicts the category (species, phenotype, etc.) of each spectrum in a list
of MALDIquant MassPeaks using one or more trained models (e.g., multinomial
logistic regression from LogReg). Peaks are matched to a given shortlist of
discriminant m/z values (moz) within a tolerance; unmatched positions are
filled with \code{noMatch}. If several models are supplied, the function also
produces meta-predictions: per-class Fisher combinations (if 'metap' is
available) and a majority-vote fraction across models.
}
\details{
\itemize{
\item Matching and normalization: peak-to-moz matching is performed by
\code{build_X_from_peaks_fast} (C++-backed; nearest-within-tolerance). If no peak of a
spectrum matches the shortlist at the initial tolerance, the tolerance is increased in
steps of \code{toleranceStep}, for a small number of attempts, until at least one match
is found. If \code{normalizeFun = TRUE}, each row is divided by its maximum (guarded to
avoid divide-by-zero).
\item Multiple models: when several models are supplied, the output contains one
set of probabilities per model (the \code{method} column identifies the model). Two
additional rows per spectrum can be appended:
\itemize{
\item comb_fisher: per-class Fisher combined p-values computed via metap::sumlog (if available).
\item max_vote: per-class fraction of models casting the top-probability vote for that class.
}
\item Models: this function is agnostic of the modeling engine as long as
\code{predict(..., type = "prob")} is implemented (e.g., caret multinom/nnet/ranger/xgb, glmnet, etc.).
}
}
\examples{
\donttest{
library(MSclassifR)
library(MALDIquant)

## 1) Preprocess and detect peaks
data("CitrobacterRKIspectra", "CitrobacterRKImetadata", package = "MSclassifR")
spectra <- SignalProcessing(CitrobacterRKIspectra)
peaks   <- MSclassifR::PeakDetection(x = spectra, averageMassSpec = FALSE)

## 2) Build X and Y (sample-by-peak intensities + labels)
##    Option A: via the build_XY_from_peaks helper (dense return here, since sparse = FALSE):
Y <- factor(CitrobacterRKImetadata$Species)
xy <- build_XY_from_peaks(peaks, labels = Y, normalize = "max", sparse = FALSE)
X <- xy$X
Y <- xy$Y

##    Option B: via MALDIquant::intensityMatrix (as in the original examples)
##IntMat <- MALDIquant::intensityMatrix(peaks)
##rownames(IntMat) <- paste(CitrobacterRKImetadata$Strain_name_spot)
##IntMat[is.na(IntMat)] <- 0
##IntMat <- t(apply(IntMat, 1, function(x) x / max(x)))  # per-spectrum max norm
##X <- IntMat                                            # spectra in rows, features (m/z) in columns
##Y <- factor(CitrobacterRKImetadata$Species)

## 3) Select discriminant m/z with "cvp" method
a <- MSclassifR::SelectionVar(
  X, Y,
  MethodSelection = "cvp",
  MethodValidation = "cv",
  PreProcessing = c("center","scale","nzv","corr"),
  NumberCV = 2,
  Metric = "Kappa"
)
sel_moz <- a$sel_moz

## 4) Train several models on the shortlisted m/z
model_lm  <- MSclassifR::LogReg(X = X, moz = sel_moz, Y = Y, number = 2,
 repeats = 2, Metric = "Kappa", kind = "linear")
model_nn  <- MSclassifR::LogReg(X = X, moz = sel_moz, Y = Y, number = 2,
 repeats = 2, Metric = "Kappa", kind = "nnet", Sampling = "up")
model_rf  <- MSclassifR::LogReg(X = X, moz = sel_moz, Y = Y, number = 2,
 repeats = 2, Metric = "Kappa", kind = "rf",  Sampling = "down")
model_svm <- MSclassifR::LogReg(X = X, moz = sel_moz, Y = Y, number = 2,
 repeats = 2, Metric = "Kappa", kind = "svm", Sampling = "up")

Models <- list(
  model_lm$train_mod,
  model_nn$train_mod,
  model_rf$train_mod,
  model_svm$train_mod
)

## 5) Predict classes for a subset of peaks; 6 Da tolerance for matching
prob_cat <- MSclassifR::PredictLogReg(
  peaks     = peaks[1:5],
  model     = Models,
  moz       = sel_moz,
  tolerance = 6,
  Reference = Y[1:5]
)
prob_cat

## 6) Meta-classifier strategy (several RF models + SMOTE + Fisher combine)
a2 <- MSclassifR::SelectionVar(X, Y, MethodSelection = "mda", Ntree = 5 * ncol(X))
sel_moz2 <- a2$sel_moz
models2 <- vector("list", 4L)
for (i in seq_along(models2)) {
  models2[[i]] <- MSclassifR::LogReg(
    X = X, moz = sel_moz2, Y = Y,
    number = 5, repeats = 5,
    kind = "rf", Metric = "Kappa",
    Sampling = "smote"
  )$train_mod
}
prob_cat2 <- MSclassifR::PredictLogReg(
  peaks = peaks,
  model = models2,
  moz   = sel_moz2,
  tolerance = 6,
  Reference = Y
)
}

}
\references{
Kuhn, M. (2008). Building predictive models in R using the caret package. Journal of Statistical Software, 28(1), 1--26.

Alexandre Godmer, Yahia Benzerara, Emmanuelle Varon, Nicolas Veziris, Karen Druart,
Renaud Mozet, Mariette Matondo, Alexandra Aubry, Quentin Giai Gianetto (2025).
MSclassifR: An R package for supervised classification of mass spectra with machine learning methods.
Expert Systems with Applications, 294, 128796. \doi{10.1016/j.eswa.2025.128796}
}
\seealso{
LogReg; SelectionVar; SelectionVarStat_fast; build_X_from_peaks_fast
}
