## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment  = "#>"
)

## ----install, eval = FALSE----------------------------------------------------
# # From CRAN
# install.packages("PONG2")
# 
# # From GitHub
# remotes::install_github("NormanLabUCD/PONG2")

## ----load---------------------------------------------------------------------
library(PONG2)
packageVersion("PONG2")

## ----example_data-------------------------------------------------------------
# Load built-in example data
data(PONG2_example)

# SNP genotype data
cat("SNP genotype data:\n")
cat("  Samples:", ncol(example_snp$genotype), "\n")
cat("  SNPs:   ", nrow(example_snp$genotype), "\n")
cat("  Assembly:", example_snp$assembly, "\n")

# KIR allele table
cat("\nKIR allele table:\n")
cat("  Locus:  ", example_kir$locus, "\n")
cat("  Alleles:", length(example_kir$allele), "\n")
cat("  Samples:", length(example_kir$value$sample.id), "\n")
head(example_kir$value, 3)

## ----predict------------------------------------------------------------------
# Load the pre-trained example model
model <- hlaModelFromObj(example_mobj)

cat("Model summary:\n")
cat("  KIR locus:   ", model$hla.locus, "\n")
cat("  Classifiers: ", length(example_mobj$classifiers), "\n")
cat("  Alleles:     ", length(model$hla.allele), "\n")

# Predict KIR genotypes on example SNP data
pred <- kirPredict(
  object  = model,
  snp     = example_snp,
  type    = "response+prob",
  verbose = FALSE
)

# View predictions
cat("\nFirst 5 predictions:\n")
head(pred$value, 5)

# Call rate
called    <- sum(!is.na(pred$value$allele1))
call_rate <- round(called / nrow(pred$value) * 100, 1)
cat("\nCall rate:", call_rate, "%\n")

# Clean up model handle
hlaClose(model)

## ----train--------------------------------------------------------------------
# Set up parallel cluster
cl <- parallel::makeCluster(2)

# Train a small KIR3DL1 model using example data
model_trained <- kirParallelAttrBagging(
  cl          = cl,
  hla         = example_kir,
  snp         = example_snp,
  nclassifier = 5,
  verbose     = FALSE
)

parallel::stopCluster(cl)

# View trained model summary
print(model_trained)

# Save model as R object
mobj_trained <- hlaModelToObj(model_trained)
cat("\nModel saved with", length(mobj_trained$classifiers), 
    "classifiers\n")

# Clean up
hlaClose(model_trained)

## ----evaluate-----------------------------------------------------------------
# Split example data into train and test
set.seed(42)
n       <- length(example_kir$value$sample.id)
idx     <- sample(1:n, floor(n * 0.7))
train_samples <- example_kir$value$sample.id[idx]
test_samples  <- example_kir$value$sample.id[-idx]

# Subset KIR and SNP data
train_kir <- hlaAlleleSubset(example_kir,
               match(train_samples, example_kir$value$sample.id))
test_kir  <- hlaAlleleSubset(example_kir,
               match(test_samples, example_kir$value$sample.id))

# Train model on training set
cl <- parallel::makeCluster(2)
model_eval <- kirParallelAttrBagging(
  cl          = cl,
  hla         = train_kir,
  snp         = example_snp,
  nclassifier = 5,
  verbose     = FALSE
)
parallel::stopCluster(cl)

# Subset SNP data to test samples
test_snp_idx <- match(test_samples, example_snp$sample.id)
test_snp     <- hlaGenoSubset(example_snp, samp.sel = test_snp_idx)

# Predict on test set
pred_eval <- kirPredict(
  object  = model_eval,
  snp     = test_snp,
  type    = "response+prob",
  verbose = FALSE
)

# Compare predictions to truth
comp <- hlaCompareAllele(
  TrueHLA        = test_kir,
  PredHLA        = pred_eval,
  call.threshold = 0.5,
  verbose        = FALSE
)

cat("Evaluation results:\n")
cat("  Haplotype accuracy:", 
    round(comp$overall$acc.haplo * 100, 1), "%\n")
cat("  Genotype accuracy: ", 
    round(comp$overall$acc.ind * 100, 1), "%\n")
cat("  Call rate:         ", 
    round(comp$overall$call.rate * 100, 1), "%\n")

# Clean up
hlaClose(model_eval)

## ----cli_example, eval = FALSE------------------------------------------------
# # These commands run from the terminal after adding pong2 to PATH
# system("pong2 impute -i example/chr19 -o output -l KIR3DL1 -a hg19")
# system("pong2 train --bfile example/chr19 --kfile example/kir_call.csv \
#         --output output --locus KIR3DL1 --assembly hg19")

## ----session------------------------------------------------------------------
sessionInfo()

