In this tutorial, we demonstrate how to use Monocle 2 to resolve the complicated haematopoiesis process that contains five branch points from the Paul dataset by applying reversed graph embedding (RGE). The reconstructed developmental trajectory is learned in ten dimensions, selected based on PC variance explained, but can be visualized in two dimensions using a tree layout. We show that the structure of the tree we reconstructed match well with the cell type assignment from the original paper. When we use genes based on the Paul dataset to define stemness or lineage specificity for each cell on the tree, we can see the smooth transition of cell differentation. Multi-way kinetic curves / heatmap visualizes the dynamics of six different cell fate commitment.
This notebook reproduces figures shown in Supplementary Figure 16 of the Monocle 2 paper.
Make sure you install Monocle 2 and other packages correctly in your environment first
rm(list = ls()) # clear the environment
#load all the necessary libraries
options(warn=-1) # turn off warning message globally
suppressMessages(library(monocle))
suppressMessages(library(plyr))
suppressMessages(library(dplyr))
In order to match our analysis with other groups's analysis, in particular those by DPT method from Thesis group, we used the RData from Maren Büttner
# this RData is from Maren Büttner (https://github.com/theislab/scAnalysisTutorial)
load('./Paul_Cell_MARSseq_GSE72857.RData')
# the following code is used to select feature genes used by Maren
gene.names <-sapply(strsplit(rownames(data.debatched), ";"), "[", 1)
is.informative <- gene.names %in% info.genes[order(info.genes)]
data.info.genes <- data.debatched[is.informative,]
rownames(data.info.genes) <- gene.names[is.informative]
################################################################################################################################################
# obtain this mat file from Ido Amit group
MAP_cells_clusters <- read.csv('/Users/xqiu/Downloads/MAP.csv', header = F)
row.names(MAP_cells_clusters) <- MAP_cells_clusters$V1
#filtering cells to include only the ones which were assigned a cluster id:
valid_subset_GSE72857_exprs <- read.table('/Users/xqiu/Downloads/GSE72857_umitab.txt', header = T, row.names = 1)
design_mat <- read.table('/Users/xqiu/Dropbox (Personal)/Projects/DDRTree_fstree/DDRTree_fstree/csv_data/GSE72857_experimental_design.txt', header = T, row.names = 1, skip = 19, sep = '\t')
design_mat$cluster <- MAP_cells_clusters[row.names(design_mat), 'V2']
valid_design_mat <- subset(design_mat, !is.na(cluster))
# Get the intersect gene used by Maren Büttner and the genes we have
common_genes <- rownames(valid_subset_GSE72857_exprs)[rownames(valid_subset_GSE72857_exprs) %in% info.genes]
fd <- new("AnnotatedDataFrame", data = data.frame(gene_short_name = common_genes, row.names = common_genes))
pd <- new("AnnotatedDataFrame", data = valid_design_mat)
# create a CDS with data.info.genes
valid_subset_GSE72857_cds <- newCellDataSet(as(as.matrix(data.info.genes[common_genes, ]), 'sparseMatrix'),
phenoData = pd,
featureData = fd,
lowerDetectionLimit=1,
expressionFamily=negbinomial.size())
valid_subset_GSE72857_cds <- estimateSizeFactors(valid_subset_GSE72857_cds)
valid_subset_GSE72857_cds <- estimateDispersions(valid_subset_GSE72857_cds)
pData(valid_subset_GSE72857_cds)$cell_type <- revalue(as.character(pData(valid_subset_GSE72857_cds)$cluster),
c("1" = 'erythroid', "2" = 'erythroid', "3" = 'erythroid', "4" = 'erythroid', "5" = 'erythroid', "6" = 'erythroid',
"7" = 'CMP', "8" = 'CMP', "9" = 'CMP', "10" = 'CMP',
"11" = 'DC',
"12" = 'GMP', "13" = 'GMP', "14" = 'GMP', "15" = 'GMP', "16" = 'GMP', "17" = 'GMP', "18" = 'GMP',
"19" = 'lymphoid'))
#remove all lymphoid cells as it is not belong to myeloid lineage
valid_subset_GSE72857_cds <- valid_subset_GSE72857_cds[, pData(valid_subset_GSE72857_cds)$cell_type != 'lymphoid']
options(repr.plot.width=4, repr.plot.height=3)
plot_pc_variance_explained(valid_subset_GSE72857_cds) + geom_vline(xintercept = 10)
valid_subset_GSE72857_cds2 <- reduceDimension(valid_subset_GSE72857_cds, norm_method = 'log', verbose = F, max_components = 10)
valid_subset_GSE72857_cds2 <- orderCells(valid_subset_GSE72857_cds2, reverse = T)
Now we can set the root state for the tree layout as state 10, and
detailed_cell_type_color <- c("B" = "#E088B8", "DC" = "#46C7EF", "E" = "#EFAD1E", "Ery" = "#8CB3DF", "M" = "#53C0AD", "MP/EP" = "#4EB859", "GMP" = "#D097C4", "MK" = "#ACC436", "N" = "#F5918A")
pData(valid_subset_GSE72857_cds2)$cell_type2 <- revalue(as.character(pData(valid_subset_GSE72857_cds2)$cluster),
c("1" = 'Ery', "2" = 'Ery', "3" = 'Ery', "4" = 'Ery', "5" = 'Ery', "6" = 'Ery',
"7" = 'MP/EP', "8" = 'MK', "9" = 'GMP', "10" = 'GMP',
"11" = 'DC',
"12" = 'B', "13" = 'B', "14" = 'M', "15" = 'M', "16" = 'N', "17" = 'N', "18" = 'E',
"19" = 'lymphoid'))
# we find a typo where the number labels of states 4 and 1 are swapped in the figure 16A, we will correct this.
options(repr.plot.width=3, repr.plot.height=4)
plot_complex_cell_trajectory(valid_subset_GSE72857_cds2, color_by = 'State', show_branch_points = T,
cell_size = 0.5, cell_link_size = 0.3, root_states = c(10))
options(repr.plot.width=4, repr.plot.height=3)
plot_complex_cell_trajectory(valid_subset_GSE72857_cds2, color_by = 'as.factor(cell_type2)', show_branch_points = T,
cell_size = 0.5, cell_link_size = 0.3, root_states = c(10)) + scale_size(range = c(0.2, 0.2)) +
theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
theme (legend.position="left", legend.title=element_blank()) + scale_color_manual(values = detailed_cell_type_color)
annotation_row = data.frame(`Lineage` = c("1" = 'Ery', "2" = 'Ery', "3" = 'Ery', "4" = 'Ery', "5" = 'Ery', "6" = 'Ery',
"7" = 'MP/EP', "8" = 'MK', "9" = 'GMP', "10" = 'GMP',
"11" = 'DC',
"12" = 'Bas', "13" = 'Bas', "14" = 'M', "15" = 'M', "16" = 'Neu', "17" = 'Neu', "18" = 'Eos',
"19" = 'lymphoid'))
state_cluster_stat <- table(pData(valid_subset_GSE72857_cds2)[, c('State', 'cluster')])
state_cluster_stat <- apply(state_cluster_stat, 2, function(x) x / sum(x))
state_cluster_stat_ordered <- t(state_cluster_stat)
detailed_cell_type_color <- c("Bas" = "#E088B8", "DC" = "#46C7EF", "Eos" = "#EFAD1E", "Ery" = "#8CB3DF", "M" = "#53C0AD", "MP/EP" = "#4EB859", "GMP" = "#D097C4", "MK" = "#ACC436", "Neu" = "#F5918A")
annotation_colors = list(`Lineage` = detailed_cell_type_color)
options(repr.plot.width=4, repr.plot.height=4)
pheatmap::pheatmap(state_cluster_stat_ordered, cluster_cols = F, cluster_rows = F, color = colorRampPalette(RColorBrewer::brewer.pal(n=9, name='Greys'))(10),
annotation_row = annotation_row, annotation_colors = annotation_colors)
The color legend for cell types above is a nice representation but kind hard to distinguish for those who doesn't have gifted eyes. We explicitly label the names for the cell types below and this helps annotating the cell types in Reproduce Figure 16A
annotation_row = data.frame(`Lineage` = c("Ery (1)" = 'Ery', "Ery (2)" = 'Ery', "Ery (3)" = 'Ery', "Ery (4)" = 'Ery', "Ery (5)" = 'Ery', "Ery (6)" = 'Ery',
"MP/EP (7)" = 'MP/EP', "MK (8)" = 'MK', "GMP (9)" = 'GMP', "GMP (10)" = 'GMP',
"DC (11)" = 'DC',
"Bas (12)" = 'Bas', "Bas (13)" = 'Bas', "M (14)" = 'M', "M (15)" = 'M', "Neu (16)" = 'Neu', "Neu (17)" = 'Neu', "Eos (18)" = 'Eos',
"lymphoid (19)" = 'lymphoid'))
state_cluster_stat <- table(pData(valid_subset_GSE72857_cds2)[, c('State', 'cluster')])
state_cluster_stat <- apply(state_cluster_stat, 2, function(x) x / sum(x))
colnames(state_cluster_stat) <- row.names(annotation_row)[1:18]
state_cluster_stat_ordered <- t(state_cluster_stat)
detailed_cell_type_color <- c("Bas" = "#E088B8", "DC" = "#46C7EF", "Eos" = "#EFAD1E", "Ery" = "#8CB3DF", "M" = "#53C0AD", "MP/EP" = "#4EB859", "GMP" = "#D097C4", "MK" = "#ACC436", "Neu" = "#F5918A")
annotation_colors = list(`Lineage` = detailed_cell_type_color)
options(repr.plot.width=5, repr.plot.height=4)
pheatmap::pheatmap(state_cluster_stat_ordered, cluster_cols = F, cluster_rows = F, color = colorRampPalette(RColorBrewer::brewer.pal(n=9, name='Greys'))(10),
annotation_row = annotation_row, annotation_colors = annotation_colors)
In the following, we first load the genes (based on an orthogonal Olsson dataset) to define stemness or lineage score.
# load in the genes used to define stemness and genes used to define lineage score:
load('gene_set') #including three variables: all_down_valid, positive_score_genes, negtive_score_genes
cell_ery_meg_lineage_score <- esApply(valid_subset_GSE72857_cds2[c(positive_score_genes, negtive_score_genes), ], 2, function(x) mean(x[1:length(positive_score_genes)]) - mean(x[length(positive_score_genes):length(x)]))
pData(valid_subset_GSE72857_cds2)$ery_meg_lineage_score <- -cell_ery_meg_lineage_score
options(repr.plot.width=3, repr.plot.height=4)
plot_complex_cell_trajectory(valid_subset_GSE72857_cds2, color_by = 'ery_meg_lineage_score', show_branch_points = T, cell_link_size = 0.3, cell_size = 0.0001, root_states = 10) +
scale_colour_gradient2() + scale_size(range = c(0.2, 0.2)) +
theme(axis.text.x = element_text(angle = 30, hjust = 1)) + theme (legend.position="right", legend.title=element_blank()) +
theme(legend.position="top", legend.title=element_blank())
#normalize UMI to TPM:
tpm_exprs_valid_subset_GSE72857_cds2 <- esApply(valid_subset_GSE72857_cds2, 2, function(x) x / sum(x) * 1e6)
cell_stemness_score <- apply(tpm_exprs_valid_subset_GSE72857_cds2[all_down_valid, ], 2, function(x) mean(x))
# cell_stemness_score <- esApply(valid_subset_GSE72857_cds2[all_down_valid, ], 2, function(x) mean(x))
pData(valid_subset_GSE72857_cds2)$cell_stemness_score <- log2(cell_stemness_score)
pData(valid_subset_GSE72857_cds2)$no_expression <- cell_stemness_score == 0
options(repr.plot.width=5, repr.plot.height=4)
plot_complex_cell_trajectory(valid_subset_GSE72857_cds2, color_by = 'cell_stemness_score', show_branch_points = T, cell_link_size = 0.3, cell_size = 0.0001, root_states = 10) +
scale_colour_gradientn(colours = terrain.colors(10)) + facet_wrap(~no_expression) + scale_size(range = c(0.2, 0.2)) +
theme(axis.text.x = element_text(angle = 30, hjust = 1)) + theme (legend.position="right", legend.title=element_blank()) +
theme(legend.position="top", legend.title=element_blank())
# ensure pseudotime starts from cell state 10
which(pData(valid_subset_GSE72857_cds2)$Pseudotime == 0) # this should be 449
marseq_complex_tree_ery_meg_lineage_score_kinetic_curves_cols <- c("Neu" = "#6BBE44", "DC" = "#31C5F4", "Baso" = "#D493C0", "Mono" = "#F58F89", "MK" = "#4CBD8E", "Ery" = "#BCC132")
valid_subset_GSE72857_cds2 <- orderCells(valid_subset_GSE72857_cds2, root_state = 10)
options(repr.plot.width=8, repr.plot.height=2)
plot_multiple_branches_pseudotime(valid_subset_GSE72857_cds2[c('Car1', 'Elane', 'Car2', 'Prtn3'),],
branches=c(1, 3, 4, 6, 11, 9),
branches_name=c("Neu", "DC", "Baso", "Mono", "MK", "Ery"), color_by = 'Branch',
nrow = 1, ncol = 4) + scale_color_manual(values = marseq_complex_tree_ery_meg_lineage_score_kinetic_curves_cols)
options(repr.plot.width=8, repr.plot.height=12)
plot_multiple_branches_heatmap(valid_subset_GSE72857_cds2[c(positive_score_genes, negtive_score_genes),],
branches=c(1, 3, 4, 6, 11, 9),
branches_name=c("Neu", "DC", "Baso", "Mono", "MK", "Ery"),
show_rownames=T,
num_clusters=4)
sessionInfo()