In this vignette, we illustrate how to apply the GenAlgo package to the problem of feature selection in an “omics-scale” data set. We start by loading the packages that we will need.
library(GenAlgo)
library(Umpire)
library(oompaBase)
We will use the Umpire package to simulate a data set complex enough to stress our feature selection algorithm. We begin by setting up a time-to-event model, built on an exponential baseline.
set.seed(391629)
sm <- SurvivalModel(baseHazard=1/5, accrual=5, followUp=1)
Next, we build a “cancer model” with six subtypes.
nBlocks <- 20 # number of possible hits
cm <- CancerModel(name="cansim",
                  nPossible=nBlocks,
                  nPattern=6,
                  OUT = function(n) rnorm(n, 0, 1),
                  SURV = function(n) rnorm(n, 0, 1),
                  survivalModel=sm)
### Include 100 blocks/pathways that are not hit by cancer
nTotalBlocks <- nBlocks + 100
Now define the hyperparameters for the models.
### block size
blockSize <- round(rnorm(nTotalBlocks, 100, 30))
### log normal mean hypers
mu0 <- 6
sigma0 <- 1.5
### log normal sigma hypers
rate <- 28.11
shape <- 44.25
### block corr
p <- 0.6
w <- 5
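To get a feel for the sigma hyperparameters, we can look at the inverse-gamma distribution of per-feature standard deviations that they imply. This is an illustrative aside, not part of the simulation; running it would advance the random number generator and change the simulated data below, so it is best tried in a separate session.
summary(1/rgamma(1000, rate=rate, shape=shape)) # typical sd on the log2 scale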
Now set up the baseline “Engine”.
rho <- rbeta(nTotalBlocks, p*w, (1-p)*w)
base <- lapply(1:nTotalBlocks,
               function(i) {
                 bs <- blockSize[i]
                 co <- matrix(rho[i], nrow=bs, ncol=bs)
                 diag(co) <- 1
                 mu <- rnorm(bs, mu0, sigma0)
                 sigma <- matrix(1/rgamma(bs, rate=rate, shape=shape), nrow=1)
                 covo <- co * (t(sigma) %*% sigma)
                 MVN(mu, covo)
               })
eng <- Engine(base)
When a cancer subtype “hits” a block, we alter the mean expression of its features; otherwise, expression is generated from the original engine components.
altered <- alterMean(eng, normalOffset, delta=0, sigma=1)
object <- CancerEngine(cm, eng, altered)
summary(object)
## A 'CancerEngine' using the cancer model:
## --------------
## cansim , a CancerModel object constructed via the function call:
## CancerModel(name = "cansim", nPossible = nBlocks, nPattern = 6, SURV = function(n) rnorm(n, 0, 1), OUT = function(n) rnorm(n, 0, 1), survivalModel = sm)
##
## Pattern prevalences:
## [1] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
##
## Survival effects:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.13455 -0.06092 0.30563 0.27837 0.73876 2.51446
##
## Outcome effects:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.335728 -0.590101 0.009406 -0.104634 0.143112 1.454560
## --------------
##
## Base expression given by:
## An Engine with 120 components.
##
## Altered expression given by:
## An Engine with 120 components.
rm(altered, base, blockSize, cm, eng, mu0, nBlocks, nTotalBlocks,
   p, rate, rho, shape, sigma0, sm, w)
Now we can use this elaborate setup to generate the simulated data.
train <- rand(object, 198)
tdata <- train$data
pid <- paste("PID", sample(1001:9999, 198+93), sep='')
rownames(train$clinical) <- colnames(tdata) <- pid[1:198]
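As a quick sanity check, we can confirm the shape of the simulated training set; Umpire stores features in rows and samples in columns.
dim(train$data)     # features by samples
dim(train$clinical) # one row of clinical data per sample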
Of course, to make things harder, we will add noise to the simulated measurements.
noise <- NoiseModel(3, 1, 1e-16) # additive noise ~ N(3, 1); negligible multiplicative noise
train$data <- log2(blur(noise, 2^(tdata))) # add noise on the raw scale, then return to log2
sum(is.na(train$data))
## [1] 0
rm(tdata)
summary(train$clinical)
## CancerSubType Outcome LFU Event
## Min. :1.000 Bad :106 Min. : 0.00 Mode :logical
## 1st Qu.:2.000 Good: 92 1st Qu.: 3.00 FALSE:88
## Median :4.000 Median :19.00 TRUE :110
## Mean :3.662 Mean :22.32
## 3rd Qu.:5.000 3rd Qu.:36.75
## Max. :6.000 Max. :67.00
summary(train$data[, 1:3])
## PID7842 PID6153 PID2085
## Min. : 1.410 Min. : 1.337 Min. : 1.724
## 1st Qu.: 5.021 1st Qu.: 5.008 1st Qu.: 5.071
## Median : 6.084 Median : 6.048 Median : 6.117
## Mean : 6.127 Mean : 6.099 Mean : 6.163
## 3rd Qu.: 7.172 3rd Qu.: 7.122 3rd Qu.: 7.192
## Max. :11.705 Max. :12.687 Max. :12.170
Now we can also simulate a validation data set.
valid <- rand(object, 93)
vdata <- valid$data
vdata <- log2(blur(noise, 2^(vdata))) # add noise
sum(is.na(vdata))
## [1] 0
vdata[is.na(vdata)] <- 0.26347 # defensively impute any missing values (there are none here)
valid$data <- vdata
colnames(valid$data) <- rownames(valid$clinical) <- pid[199:291]
rm(vdata, noise, object, pid)
summary(valid$clinical)
## CancerSubType Outcome LFU Event
## Min. :1.000 Bad :54 Min. : 0.00 Mode :logical
## 1st Qu.:2.000 Good:39 1st Qu.: 3.00 FALSE:28
## Median :3.000 Median :15.00 TRUE :65
## Mean :3.172 Mean :22.32
## 3rd Qu.:5.000 3rd Qu.:35.00
## Max. :6.000 Max. :71.00
summary(valid$data[, 1:3])
## PID6584 PID5256 PID2944
## Min. : 1.287 Min. : 1.317 Min. : 1.760
## 1st Qu.: 4.957 1st Qu.: 4.997 1st Qu.: 5.022
## Median : 5.995 Median : 6.030 Median : 6.064
## Mean : 6.067 Mean : 6.083 Mean : 6.121
## 3rd Qu.: 7.109 3rd Qu.: 7.116 3rd Qu.: 7.151
## Max. :12.205 Max. :12.223 Max. :12.341
Now we can start using the GenAlgo package. The key step is to define sensible functions that can measure the “fitness” of a solution and introduce “mutations”. When these functions are called, they are passed a context argument that can be used to access extra information about how to proceed. In this case, that context will be the train object, which includes the clinical information about the samples.
Now we can define the fitness function. The idea is to compute the Mahalanobis distance between the two groups (of “Good” or “Bad” outcome samples) in the space defined by the selected features.
measureFitness <- function(arow, context) {
  predictors <- t(context$data[arow, ]) # space defined by the selected features
  groups <- context$clinical$Outcome    # good or bad outcome
  maha(predictors, groups, method='var')
}
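As an illustrative aside, we can evaluate this fitness function on a single randomly chosen feature set; since the seed is reset before the genetic algorithm starts, this extra draw does not change the results below.
arow <- sample(1:nrow(train$data), 10) # one random candidate set of ten features
measureFitness(arow, context=train)    # Mahalanobis distance between the outcome groups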
To introduce a mutation, the mutation function replaces the given allele with another feature/row index chosen uniformly at random, swapping out one possible predictor of the outcome for another.
mutator <- function(allele, context) {
  sample(1:nrow(context$data), 1)
}
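Note that this mutator ignores the current value of the allele; every feature is equally likely to replace it. For example (illustrative):
mutator(allele=1, context=train) # a random feature index between 1 and nrow(train$data)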
We need to decide how many features to include in a potential predictor (here we use ten). We also need to decide how big a population of feature-sets (here we use 200) should be used in each generation of the genetic algorithm.
set.seed(821831)
n.individuals <- 200
n.features <- 10
y <- matrix(0, n.individuals, n.features)
for (i in 1:n.individuals) {
  y[i, ] <- sample(1:nrow(train$data), n.features)
}
Having chosen the starting population, we can run the first step of the genetic algorithm.
my.ga <- GenAlg(y, measureFitness, mutator, context=train) # initialize
summary(my.ga)
## An object representing generation 1 in a genetic algorithm.
## Population size: 200
## Mutation probability: 0.001
## Crossover probability: 0.5
## Fitness distribution:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02137 0.14909 0.22210 0.24071 0.31384 0.58730
To be able to evaluate things later, we save the starting generation.
recurse <- my.ga
pop0 <- sort(table(as.vector(my.ga@data)))
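The table pop0 records how often each feature appears in the random starting population; an illustrative peek at its upper tail shows which features happen to be most common at generation 1.
tail(pop0)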
Realistically, we probably want to run a couple of hundred or even a couple of thousand iterations of the algorithm. But, in the interests of completing this vignette in a reasonable amount of time, we are only going to iterate through 20 generations.
NGEN <- 20
diversity <- meanfit <- fitter <- rep(NA, NGEN)
for (i in 1:NGEN) {
  recurse <- newGeneration(recurse)
  fitter[i] <- recurse@best.fit
  meanfit[i] <- mean(recurse@fitness)
  diversity[i] <- popDiversity(recurse)
}
Plot max and mean fitness by generation. This figure shows that both the mean and the maximum fitness are increasing.
plot(fitter, type='l', ylim=c(0, 1.5), xlab="Generation", ylab="Fitness")
abline(h=max(fitter), col='gray', lty=2)
lines(fitter)
lines(meanfit, col='gray')
points(meanfit, pch=16, col=jetColors(NGEN))
legend("bottomleft", c("Maximum", "Mean"), col=c("black", "blue"), lwd=2)
Plot the diversity of the population, to see that it is decreasing.
plot(diversity, col='gray', type='l', ylim=c(0,10), xlab='', ylab='', yaxt='n')
points(diversity, pch=16, col=jetColors(NGEN))
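We can also inspect the best feature set found so far. Assuming the GenAlg object exposes a best.individual slot (alongside the best.fit slot used above), an illustrative look:
recurse@best.individual # assumed slot: the feature set achieving the maximum fitness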
See which predictors get selected most frequently in the latest generation.
sort(table(as.vector(recurse@data)))
##
## 1137 2839 3318 5599 9337 2852 3439 4248 5851 7642 7720 9039
## 1 1 1 1 1 2 2 2 2 2 2 2
## 9771 11427 49 918 1599 2390 5344 6130 6333 8322 8725 8807
## 2 2 3 3 3 3 3 3 3 3 3 3
## 9150 9512 10186 10884 11139 1958 4632 4991 5094 6601 9728 11091
## 3 3 3 3 3 4 4 4 4 4 4 4
## 12304 93 456 3801 3857 6386 6657 7680 8741 10825 11512 4100
## 4 5 5 5 5 5 5 5 5 5 5 6
## 9044 12161 2845 6022 9923 7846 5152 5473 9574 81 10220 10965
## 6 6 7 7 7 8 9 9 9 11 11 11
## 2335 3055 9565 10435 1176 6440 8749 449 10765 1534 6959 7306
## 12 12 12 12 14 15 16 17 17 18 18 19
## 11164 7000 8651 3838 8973 9524 5086 8832 11464 8683 1451 4976
## 20 21 21 22 23 24 27 27 28 29 32 34
## 4953 6568 8926 7348 1394 9874 10247 171 6117 8600 1855 1935
## 38 44 49 61 62 65 72 84 88 101 120 152
## 936 6252
## 158 159
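As a rough, illustrative check that the population is concentrating on a few features, we can compare how often the most common features in the final generation appeared in the starting population; a feature absent at generation 1 shows up as NA.
topNow <- names(tail(sort(table(as.vector(recurse@data))), 4))
pop0[topNow] # generation-1 counts for the current top features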