## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  tidy.opts = list(width.cutoff = 100), tidy = TRUE
)

## ----setup--------------------------------------------------------------------
library(hsrecombi) 
library(AlphaSimR)
library(doParallel)
library(ggplot2)
library(rlist)
Sys.time()

## ----global-------------------------------------------------------------------
# number of chromosomes
nchr <- 2
# Number of simulated SNPs (e.g., p = 1500 resembles an average bovine chromosome)
p <- 150
# Number of simulated progeny in each half-sib family 
n <- 1000
# Directory for (simulated) data
path.dat <- 'data'
dir.create(path.dat, showWarnings = FALSE)
# Directory for output
path.res <- 'results'
dir.create(path.res, showWarnings = FALSE)
# Number of computing clusters allocated
nclust <- 2

## ----alphasim-----------------------------------------------------------------
founderPop <- runMacs2(nInd = 1000, nChr = nchr, segSites = p)
SP <- SimParam$new(founderPop)
SP$setSexes("yes_sys")
# Enable tracing location of recombination events
SP$setTrackRec(TRUE)  

pop <- newPop(founderPop)
N <- 10; ntotal <- N * n
my_pop <- selectCross(pop = pop, nFemale = 500, nMale = N, use = "rand", nCrosses = ntotal)

probRec <- list()
for(chr in 1:nchr){
  co.pat <- matrix(0, ncol = p, nrow = ntotal)
  for(i in 1:ntotal){
    if(nrow(SP$recHist[[1000 + i]][[chr]][[2]]) > 1){
      # 1. line contains 1 1 by default
      loci <- SP$recHist[[1000 + i]][[chr]][[2]][-1, 2] 
      co.pat[i, loci] <- 1 
    }
  }
  probRec[[chr]] <- colMeans(co.pat)
}

save(list = c('SP', 'founderPop', 'pop', 'my_pop', 'ntotal', 'probRec'), file = 'data/pop.Rdata')

## ----genetic-data-------------------------------------------------------------
PAT <- my_pop@father
rown <- paste(rep(unique(PAT), each = 2), c(1, 2), sep = '_')
H.pat <- pullSegSiteHaplo(pop)[rown, ]
X <- pullSegSiteGeno(my_pop)
# Physical position of markers in Mbp
map.snp <- lapply(founderPop@genMap, function(z) z * 100)

## ----plink-format-------------------------------------------------------------
map <- data.frame(Chr = rep(1:length(map.snp), unlist(lapply(map.snp, length))), 
                  Name = paste0('SNP', 1:length(unlist(map.snp))),
                  locus_Mb = unlist(map.snp), 
                  locus_bp = unlist(map.snp) * 1e+6)

colnames(X) <- map$Name
FID <- 'FAM001'
IID <- my_pop@id
MAT <- my_pop@mother
SEX <- 2
PHENOTYPE <- -9

for(chr in 1:nchr){
  write.table(map[map$Chr == chr, ], file.path(path.dat, paste0('map_chr', chr, '.map')), 
              col.names = F, row.names = F, quote = F)
  write.table(cbind(FID, IID, PAT, MAT, SEX, PHENOTYPE, X[, map$Chr == chr]), 
              file.path(path.dat, paste0('hsphase_input_chr', chr, '.raw')), col.names = T, row.names = F, quote = F) 
}

## ----parallel-computing-------------------------------------------------------
cl <- makeCluster(nclust)
registerDoParallel(cl)

## ----recombination-rate-------------------------------------------------------
out <- foreach(chr = 1:nchr, .packages = 'hsrecombi') %dopar% {
  
  # 1: Physical  map
  map <- read.table(file.path(path.dat, paste0('map_chr', chr, '.map')), col.names = c('Chr', 'SNP', 'locus_Mb', 'locus_bp'))
 
  # 2: Genotype matrix
  genomatrix <- data.table::fread(file.path(path.dat, paste0('hsphase_input_chr', chr, '.raw')))
  X <- as.matrix(genomatrix[, -c(1:6)])
  X[is.na(X)] <- 9 # required for hsphase
  
  # 3: Assign daughters to sire IDs
  daughterSire <- genomatrix$PAT
  
  # 4: Estimate sire haplotypes and format data 
  hap <- makehappm(unique(daughterSire), daughterSire, X)
  save('hap', file = file.path(path.res, paste0('hsphase_output_chr', chr, '.Rdata')))
  
  # Check order and dimension
  io <- sapply(1:nrow(map), function(z){grepl(x = colnames(X)[z], pattern = map$SNP[z])})
  if(sum(io) != nrow(map)) stop("ERROR in dimension")
  
  # 5: Estimate recombination rates
  res <- hsrecombi(hap, X)
  final <- editraw(res, map)
  save(list = c('final', 'map'), file = file.path(path.res, paste0("Results_chr", chr, ".Rdata")))
  
  ifelse(nrow(final) > 0, 'OK', 'no result')
}
print(which(unlist(out) == 'OK'))

## ----misplaced----------------------------------------------------------------
# 6a: Filter SNPs with unusually large recombination rate to neighbouring (30) SNPs
excl <- foreach(chr = 1:nchr, .packages = 'hsrecombi') %dopar% {
  load(file.path(path.res, paste0("Results_chr", chr, ".Rdata")))
  checkCandidates(final, map)
}

# 6b: Heatmap plot of recombination rates for visual verification, e.g.:
chr <- 2
load(file.path(path.res, paste0("Results_chr", chr, ".Rdata")))
cand <- excl[[chr]][1]
win <- match(cand, map$SNP) + (-100:100)
win <- win[(win >= 1) & (win <= nrow(map))]

target <- final[(final$SNP1 %in% map$SNP[win]) & (final$SNP2 %in% map$SNP[win]), ]
target$SNP1 <- match(target$SNP1, map$SNP)
target$SNP2 <- match(target$SNP2, map$SNP)

ggplot(data = target, aes(SNP2, SNP1, fill = theta)) + 
  geom_tile() +
  xlab("Locus 2") +
  ylab("Locus 1") +
  coord_equal() + 
  scale_y_continuous(trans = "reverse") +
  theme(panel.background = element_blank(), 
        panel.grid.major = element_line(colour = "grey", linewidth = 0.1),
        panel.grid.minor = element_line(colour = "grey")) +
  theme(text = element_text(size = 18)) +
  scale_fill_gradientn(colours = c('yellow', 'red'), limits = c(0, 1+1e-10), na.value = 'white')

# -> nothing conspicious
excl[[1]] <- excl[[2]] <- NA

## ----genetic-position---------------------------------------------------------
pos <- foreach(chr = 1:nchr, .packages = 'hsrecombi') %dopar% {
  load(file.path(path.res, paste0("Results_chr", chr, ".Rdata")))
  geneticPosition(final, map, exclude = excl[[chr]])
}

## ----stop-parallel------------------------------------------------------------
stopCluster(cl)

## ----plot-1-------------------------------------------------------------------
data <- data.frame(Chr = rep(1:length(pos), times = unlist(list.select(pos, length(pos.Mb)))), 
                   Mbp = unlist(list.select(pos, pos.Mb)), cM = unlist(list.select(pos, pos.cM)))
ggplot(data, aes(x = Mbp, y = cM)) + geom_point(na.rm = T) + facet_grid(Chr ~ .)

## ----plot-2-------------------------------------------------------------------
for (chr in 1:nchr) {
  load(file.path(path.res, paste0("Results_chr", chr, ".Rdata")))
  final$theta[(final$SNP1 %in% excl[[chr]]) | (final$SNP2 %in% excl[[chr]])] <- NA
  final$dist_M <- (pos[[chr]]$pos.cM[final$SNP2] - pos[[chr]]$pos.cM[final$SNP1]) / 100
  out <- bestmapfun(theta = final$theta, dist_M = final$dist_M)
  plot(final$dist_M, final$theta, xlab = 'genetic distance (Morgan)', ylab = 'recombination rate', col = 8)
  x <- seq(0, 0.5, 0.01); y <- rao(out$mixing, x); points(y, x, type = 'l', lwd = 2)
  legend('bottomright', legend = paste0('map function (p=', round(out$mixing, 3), ')'), lwd = 2, lty = 1, bty = 'n')
}

## ----selection----------------------------------------------------------------
chr <- 2

## ----check-simulated----------------------------------------------------------
load(file.path(path.dat, 'pop.Rdata'))
sim.cM <- cumsum(probRec[[chr]]) * 100

## ----check-deterministic------------------------------------------------------
load(file.path(path.res, paste0('hsphase_output_chr', chr, '.Rdata')))
hsphase.cM <- hap$gen

## ----plot-3-------------------------------------------------------------------
par(mar=c(5.1, 4.1, 4.1, 9.1), xpd = TRUE)
plot(pos[[chr]]$pos.Mb, pos[[chr]]$pos.cM, xlab = 'physical position (Mbp)', ylab = 'genetic position (cM)', 
     ylim = range(c(sim.cM, hsphase.cM, pos[[chr]]$pos.cM), na.rm = T), pch = 20)
points(pos[[chr]]$pos.Mb, sim.cM, pch = 20, col = 4)
points(pos[[chr]]$pos.Mb, hsphase.cM, pch = 20, col = 8)
legend('topleft', inset=c(1.01,0), legend = c('simulated', 'likelihood-based', 'deterministic'), pch = 20, col = c(4, 1, 8), bty = 'n')

## ----cleanup------------------------------------------------------------------
unlink(path.dat, recursive = TRUE)
unlink(path.res, recursive = TRUE)
Sys.time()

