RNA-Seq extended example

I will use the airway dataset from Bioconductor. In this data, the rows are genes, and columns are measurements of the amount of RNA in different biological samples. The data examines the effect of dexamethasone treatment on four different airway muscle cell lines.

Data normalization and filtering

I start with the usual mucking around for an RNA-Seq dataset to normalize and log transform the data, and get friendly gene names.

I also filter out genes with low variability (this also eliminates genes with low expression). This is mostly to keep the number of points manageable in langevitour.

library(langevitour)
library(airway)               # airway dataset
library(edgeR)                # RPM calculation
library(limma)                # makeContrasts
library(MASS)                 # ginv generalized matrix inverse
library(GPArotation)          # Bentler rotation
library(EnsDb.Hsapiens.v86)   # Gene names
library(ggplot2)
library(dplyr)
library(tibble)

# Load the airway dataset (eight samples).
data(airway)

# Per-sample annotation: TRUE where the sample was dexamethasone-treated,
# and a cell line label 1-4. The cell labels are hard-coded and assume the
# samples come as consecutive pairs, one pair per cell line.
treatment <- colData(airway)$dex == "trt"
cell <- factor(rep(c(1, 2, 3, 4), each = 2))

# No intercept: one coefficient per cell line plus a treatment effect.
design <- model.matrix(~ 0 + cell + treatment)

# Wrap the raw count matrix in a DGEList and compute normalization factors.
dge <- calcNormFactors(DGEList(assay(airway, "counts")))

# log2 Reads Per Million.
# A prior count of 5 moderates the values for counts near zero.
rpms <- cpm(dge, log = TRUE, prior.count = 5)

# Keep only genes whose standard deviation across samples is at least 0.5
# (mostly for speed), and report how many pass.
keep <- apply(rpms, MARGIN = 1, FUN = sd) >= 0.5
table(keep)
#> keep
#> FALSE  TRUE 
#> 60057  4045
# Restrict to the variable genes. drop = FALSE keeps y a matrix even if only
# a single gene passes the filter.
y <- rpms[keep, , drop = FALSE]

# Use shorter sample names: U/T for untreated/treated, then the cell line.
colnames(y) <- paste0(ifelse(treatment, "T", "U"), cell)

# Get friendly gene names.
# The row names of y are used as GENEID keys; deframe() turns the resulting
# two-column table into a named vector that can be indexed by gene ID.
symbols <-
    AnnotationDbi::select(EnsDb.Hsapiens.v86, keys=rownames(y), keytype="GENEID", columns="SYMBOL") |>
    deframe()
name <- symbols[rownames(y)]
# Genes without a symbol fall back to their original row name.
name[is.na(name)] <- rownames(y)[is.na(name)]

# Colors for the samples: red for treated, green for untreated.
colors <- ifelse(treatment, "#f00", "#080")

These are the first few rows of our normalized and log transformed data.

U = Untreated, T = Treated, 1 2 3 4 = cell line.

# Print the first five genes (rows) across all samples.
y[seq_len(5), ]
#>                    U1     T1    U2     T2     U3     T3    U4    T4
#> ENSG00000001561  1.80  1.517 3.120  3.468  0.196  0.580  2.81  3.12
#> ENSG00000002745 -1.28 -0.895 0.151 -0.116 -1.701 -1.937 -1.05 -1.41
#> ENSG00000002933  5.85  6.288 4.686  4.915  0.293  0.174  1.92  2.48
#> ENSG00000003096  4.67  4.044 4.497  3.598  4.387  3.278  4.63  3.49
#> ENSG00000003137  2.57  1.666 3.392  2.203  0.510  0.790  3.33  1.84

Contrasts of interest

I now work out how to estimate a linear model and calculate contrasts of interest. The contrasts will be made available as extra axes in the langevitour plot.

# Generalized inverse of the design matrix: one row per model coefficient,
# one column per sample. Note that ginv() does not carry over dimnames.
coefficient_estimator <- MASS::ginv(design)

# Contrasts of interest, written in terms of the design matrix columns
# (cell1..cell4, treatmentTRUE).
contrasts <- limma::makeContrasts(
    average = (cell1 + cell2 + cell3 + cell4) / 4 + treatmentTRUE / 2,
    treatment = treatmentTRUE,
    "cell1 vs others" = cell1 - (cell2 + cell3 + cell4) / 3,
    "cell2 vs others" = cell2 - (cell1 + cell3 + cell4) / 3,
    "cell3 vs others" = cell3 - (cell1 + cell2 + cell4) / 3,
    "cell4 vs others" = cell4 - (cell1 + cell2 + cell3) / 3,
    levels = design
)

# Express each contrast as a vector of per-sample weights (samples in rows,
# contrasts in columns) so it can be used as an axis in sample space.
contrastAxes <- t(coefficient_estimator) %*% contrasts

# Show the resulting weights.
contrastAxes
#>       Contrasts
#>        average treatment cell1 vs others cell2 vs others cell3 vs others
#>   [1,]   0.125     -0.25           0.500          -0.167          -0.167
#>   [2,]   0.125      0.25           0.500          -0.167          -0.167
#>   [3,]   0.125     -0.25          -0.167           0.500          -0.167
#>   [4,]   0.125      0.25          -0.167           0.500          -0.167
#>   [5,]   0.125     -0.25          -0.167          -0.167           0.500
#>   [6,]   0.125      0.25          -0.167          -0.167           0.500
#>   [7,]   0.125     -0.25          -0.167          -0.167          -0.167
#>   [8,]   0.125      0.25          -0.167          -0.167          -0.167
#>       Contrasts
#>        cell4 vs others
#>   [1,]          -0.167
#>   [2,]          -0.167
#>   [3,]          -0.167
#>   [4,]          -0.167
#>   [5,]          -0.167
#>   [6,]          -0.167
#>   [7,]           0.500
#>   [8,]           0.500

Principal Components

I will also supply Principal Components as extra axes in the plot. When doing the PCA, I don’t use scaling because the data is already all in comparable units of measurement (log2 RPM). From the scree plot, four components seem reasonable. I also supply the Bentler rotation of these axes, which may be more easily interpretable.

# Center each gene (row) on its mean across samples.
y_centered <- sweep(y, 1, rowMeans(y), "-")

# PCA without scaling, keeping four components. The prcomp() arguments are
# named `scale.` and `rank.`; they are spelled out in full here rather than
# relying on partial argument matching.
pca <- prcomp(y_centered, scale. = FALSE, rank. = 4)
# Scree plot of the component variances.
plot(pca)

# Principal component directions in sample space (samples in rows).
pcaAxes <- pca$rotation

# Apply the rotation matrix (Th) returned by bentlerT() for the PCA scores
# to the component axes, and give the rotated axes readable names.
bentlerAxes <- pca$rotation %*% bentlerT(pca$x)$Th
colnames(bentlerAxes) <- paste0("Bentler", seq_len(ncol(bentlerAxes)))

Plot the data with langevitour

We are now ready to do the langevitour plot.

A key tool we will use is deactivating an axis by unchecking its checkbox. Langevitour will then only show projections of the data that are orthogonal to the deactivated axis.

Things to try:

Notes:

# Plot genes as points in sample space. Sample axes are colored by treatment,
# points are labelled with gene names, and the contrast, PCA and Bentler
# directions are offered as extra axes.
# The color argument is spelled `axisColors` in full rather than relying on
# partial argument matching.
langevitour(
    y, scale=15, axisColors=colors, name=name,
    extraAxes=cbind(contrastAxes, pcaAxes, bentlerAxes))