RNA-seq analysis workflow
Preparation
Sample data
Data import (count matrix).
Data import (TPM).
TPM calculation
Calculate TPM sample by sample
Principal component analysis
ggplot(pcaData, aes(x = PC1, y = PC2, fill = group)) +
geom_point(size = 4, alpha = 0.6, shape = 21, color = "black", stroke = 0.5) +
ggrepel::geom_text_repel(aes(label=name),
color="grey6", size=3, hjust= -0.3, vjust=-0.3) +
labs(x = paste("PC1: ", round(100 * PCA_var[1]), "% variance"),
y = paste("PC2: ", round(100 * PCA_var[2]), "% variance")) +
theme_bw() +
theme(legend.title = element_blank()) +
ggtitle("PCA") +
labs(caption = " ")
Differentially Expressed Genes (DEG) analysis
DEG data
Visualization of DEGs Volcanoplot
res = res %>% mutate(DE=ifelse(log2FoldChange >= log2(fc) & padj < pval, 'UP',
ifelse(log2FoldChange <= -log2(fc) & padj < pval, 'DN','no_sig')))
res$DE = factor(res$DE, levels = c('UP','DN','no_sig'))
res %>%
ggplot(aes(log2FoldChange, -log10(padj), color=DE)) +
geom_point(size=1, alpha=0.5) +
scale_color_manual(values = c('red','blue','grey')) +
theme_classic() +
geom_vline(xintercept = c(-log2(fc),log2(fc)), color='grey') +
geom_hline(yintercept = -log10(0.05),color='grey') +
guides(colour = guide_legend(override.aes = list(size=5))) +
ggtitle("Resist/Naive") +
ggeasy::easy_center_title() ## to center title
Gene set enrichment analysis
Geneset
Perform GSEA
library(clusterProfiler)
perform_GSEA <- function(res, ref, pvalueCutoff = 1) {
ranking <- function(res) {
# Check the name of log2fc related
if ("avg_log2FC" %in% names(res)) {
df <- res$avg_log2FC
} else if ("log2FoldChange" %in% names(res)) {
df <- res$log2FoldChange
} else {
stop("Neither avg_log2FC nor log2FoldChange columns found in the data frame.")
}
names(df) <- rownames(res)
df <- sort(df, decreasing = TRUE)
return(df)
}
ranked.res <- ranking(res)
x <- clusterProfiler::GSEA(geneList = ranked.res,
TERM2GENE = ref,
pvalueCutoff = pvalueCutoff,
pAdjustMethod = "BH",
verbose = TRUE,
seed = TRUE)
result <- x@result %>% arrange(desc(NES))
result <- result[, c('NES', 'pvalue', 'p.adjust', 'core_enrichment', 'ID')]
return(result)
}
# Application
gsea.res1 = perform_GSEA(res = res, ref = hallmark, pvalueCutoff = 1)
NES plot
# GESA Plot
gsea_nes_plot <- function(gsea.res, title, color="p.adjust") {
gsea.res = gsea.res %>% mutate(sig=ifelse(p.adjust <= 0.05, "FDR <= 0.05", "FDX >= 0.05"))
# basic plot
p <- gsea.res %>%
ggplot(aes(reorder(ID, NES), NES)) +
geom_col(aes(fill=!!sym(color))) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score", title="GSEA") +
theme_classic() +
theme(axis.text.x = element_text(size=5, face = 'bold'),
axis.text.y = element_text(size=6, face = 'bold'),
axis.title = element_text(size=10)) +
ggtitle(title)
# color by color input type
if (color == "p.adjust") {
p <- p + scale_fill_gradient(low = 'red', high = '#E5E7E9')
} else if (color == "sig") {
p <- p + scale_fill_manual(values = c("red", "#E5E7E9"))
}
return(p)
}
# Application
gsea_nes_plot(gsea.res = gsea.res1, title = "Test1", color = "p.adjust") # color by p.adjust
gsea_nes_plot(gsea.res = gsea.res1, title = "Test2", color = "sig") # color by significance
Enrichment plot for the individual pathway
# enrichplot::gseaplot2 function
perform_GSEA <- function(res, ref, pvalueCutoff = 1) {
ranking <- function(res) {
# Check the name of log2fc related
if ("avg_log2FC" %in% names(res)) {
df <- res$avg_log2FC
} else if ("log2FoldChange" %in% names(res)) {
df <- res$log2FoldChange
} else {
stop("Neither avg_log2FC nor log2FoldChange columns found in the data frame.")
}
names(df) <- rownames(res)
df <- sort(df, decreasing = TRUE)
return(df)
}
ranked.res <- ranking(res)
x <- clusterProfiler::GSEA(geneList = ranked.res,
TERM2GENE = ref,
pvalueCutoff = pvalueCutoff,
pAdjustMethod = "BH",
verbose = TRUE,
seed = TRUE)
return(x)
}
gsea.res = perform_GSEA(res = res, ref = hallmark, pvalueCutoff = 1)
# Significant pathways
ids = gsea.res@result %>% filter(p.adjust <= 0.05) %>% rownames()
ssGSEA
Single sample GSEA
Single-sample Gene Set Enrichment Analysis (ssGSEA) is an variation
of the GSEA algorithm that instead of calculating enrichment scores for
groups of samples (i.e Control vs Disease) and sets of genes (i.e
pathways), it provides a score for each each sample and gene set
pair.
(https://www.genepattern.org/modules/docs/ssGSEAProjection/4#gsc.tab=0)
## Perform ssgsea
library(corto)
## Input data : tpm (count.mtx is accepted as well)
test = ssgsea(tpm,hallmarkList)
## Reshape it to plot
test.df = test %>% t() %>% data.frame()
rownames(test.df) = colnames(tpm)
## p value of ssgsea
pval = corto::z2p(test)
colnames(pval) = rownames(test.df)
ssGSEA heatmap
Color represents enrichment score
Significance by p value Heatmap
Red: p value <= 0.05 (significant)
White : p value > 0.05 (not significant)
Single pathway ssGSEA
Pathway sample 1
# Plot it
genesetname = "WP Apoptosis"
test.df = test.df %>%
mutate(significance = ifelse(TCSA_pval <= 0.05, max(test.df[,1]),0))
input = data.frame(test.df[,c(1,3)])
rownames(input) = rownames(test.df)
colnames(input) = c(genesetname, "p value < = 0.05")
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(50),
colorRampPalette(colors = c("white","#D35400"))(40))
input %>% pheatmap::pheatmap(cluster_rows = F, cluster_cols = F,
color=my.color, border_color = "grey33",
gaps_col = 1,
main = "ssGSEA with significance")
Pathway sample 2
# Plot it
genesetname = "GPBP DNA Damage"
test.df = test.df %>%
mutate(significance = ifelse(TCSA_pval <= 0.05, max(test.df[,1]),0))
input = data.frame(test.df[,c(1,3)])
rownames(input) = rownames(test.df)
colnames(input) = c(genesetname, "p value < = 0.05")
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(50),
colorRampPalette(colors = c("white","#D35400"))(60))
input %>% pheatmap::pheatmap(cluster_rows = F, cluster_cols = F,
color=my.color, border_color = "grey33",
gaps_col = 1,
main = "ssGSEA with significance")
Multiple genes-violin plot across samples
# Import sample data
tpm.sample = read.csv("../sampledata/tpm.sample.csv", row.names = 1)
cellmarker = read.csv('../info/DB/CellMarker2.0.csv')
celltypes = c("B cell",
"CD8+ T cell",
"Activated T cell",
"Cancer cell",
"CD4+ T cell",
"Dendritic cell",
"Macrophage")
# Functions
mks.violinplot = function(cellname, tpm=tpm.sample){
rs <- grepl(cellname, cellmarker$cell_name, ignore.case = TRUE)
mks = cellmarker[rs, ]$marker
mks = mks[mks %in% rownames(tpm)]
tpm.df =tpm %>% filter(rownames(.) %in% mks)
tpm.df$gene = rownames(tpm.df)
p= tpm.df %>% reshape::melt() %>% ggplot(aes(.[,2], log10(.[,3]))) +
geom_violin(aes(fill = variable), alpha = 0.1, show.legend = FALSE) +
geom_boxplot(outlier.size = 0, width = 0.2, alpha = 0.5, show.legend = FALSE) +
geom_jitter(aes(color = variable), alpha = 0.5, size = 1, show.legend = FALSE) +
scale_y_continuous(labels = scales::comma) +
theme_bw() +
labs(y = paste0(cellname, " genes (log10 scaled)"),
title = paste0(cellname, " signature genes ")) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust=1, size=14),
panel.grid.major.x = element_blank(),
axis.title.y = element_text(size = 14), # y 축 레이블 폰트 크기 설정
plot.title = element_text(size = 20)
)
return(p)
}
DEG analysis sample analysis
DEG CND1 vs CTRL
- DEG information table
- Volcanoplot
- DEG output.
DEG CND2 vs CTRL
- DEG information table
- Volcanoplot
- DEG output.
DEG CND2 vs CND1
- DEG information table
- Volcanoplot
- DEG output.
Select the signicant DEGs
From three comparisons
- DEG1
- DEG2
- DEG3
Kmeans clustering
Significant genes X samples matrix was used for kmeans clustering to
identify any patterns across data.
Clustering method: Kmeans
Adjustable parameter : k (number of clusters)
tpm = read.csv("../sampledata/RNAseq_sample_tpm.csv", row.names = 1)
tpm = tpm[significant_genes_all,]
############## kmeans clustering #####################
## zscore function
zscore <- function(input.data = input.data){
input.data.rowsums <- rowSums(input.data)
input.data.mean <- rowMeans(input.data)
input.data.sd <- matrixStats::rowSds(as.matrix(input.data))
names(input.data.sd) <- rownames(input.data)
zscore <- (input.data-input.data.mean)/input.data.sd
return(zscore)
}
# Create input data with zscaled
input.data <- zscore(tpm[significant_genes_all,])
input.data <- na.omit(input.data)
# Perform kmeans clustering
kmeans_clustering = function(k=3, input.data= input.data){
set.seed(1234)
fitout <- kmeans(input.data, centers = k, nstart = k)
fitout.cluster <- fitout$cluster
# table(fitout.cluster)
df = fitout.cluster %>% as.data.frame()
colnames(df) = "cluster"
rownames(df) = names(fitout.cluster)
df = df %>% arrange(cluster)
df$cluster = paste0("Cluster", df$cluster)
df$cluster = factor(df$cluster)
return(df)
}
df3 = kmeans_clustering(k = 3, input.data = input.data)
df4 = kmeans_clustering(k = 4, input.data = input.data)
df5 = kmeans_clustering(k = 5, input.data = input.data)
df6 = kmeans_clustering(k = 6, input.data = input.data)
# Draw heatmap
kmeans_heatmap = function(input.df, input.data){
df.tmp = input.df$cluster %>% table() %>% data.frame()
my.color=c(colorRampPalette(colors = c("royalblue","white"))(80),
colorRampPalette(colors = c("white","indianred2"))(90))
p =input.data[rownames(input.df),] %>%
pheatmap::pheatmap(cluster_cols = F, cluster_rows = F,
show_rownames = F,
annotation_row = input.df,
col = my.color,
fontsize_row = 6,fontsize_col = 10,
border_color = 'NA',
gaps_col = c(4,8),
gaps_row = cumsum(df.tmp$Freq))
print(p)
}
K = 3
K = 4
K = 5
K = 6
More example analyses will be updated.