library(dplyr)
library(ggplot2)
library(DESeq2)
library(edgeR)
library(reshape)
library(dplyr)
Sample data (count matrix)
Calculate TPM sample by sample
for(i in 1:ncol(count_mtx)){
rpkm = count_mtx[,i]/(gene_length/1000)
rpkm_sum = sum(rpkm)
tpm <- rpkm / rpkm_sum * 1e6
tpm_all[,i] = tpm
}
# PCA
se <- SummarizedExperiment(as.matrix(count.mtx),
colData=DataFrame(sample=1:ncol(count.mtx)))
dds <- DESeqDataSet(se, ~ 1)
dds$sample = colnames(dds)
vsd <- vst(dds, blind=FALSE)
pcaData <- plotPCA(vsd, intgroup = "sample")
ggplot(pcaData, aes(x = PC1, y = PC2, fill = group)) +
geom_point(size = 4, alpha = 0.6, shape = 21, color = "black", stroke = 0.5) +
ggrepel::geom_text_repel(aes(label=name),
color="grey6", size=3, hjust= -0.3, vjust=-0.3) +
labs(x = paste("PC1: ", round(100 * PCA_var[1]), "% variance"),
y = paste("PC2: ", round(100 * PCA_var[2]), "% variance")) +
theme_bw() +
theme(legend.title = element_blank()) +
ggtitle("PCA") +
labs(caption = " ")
# Generate info table
info <- data.frame(matrix(nrow = ncol(count.mtx), ncol = 2))
colnames(info) <- c('sample', 'cond')
info$sample <- colnames(count.mtx)
info$cond <- c(rep("CND1",4), rep("CND2",4), rep("CTRL",3))
# DESeq
dds <- DESeqDataSetFromMatrix(count.mtx, info, ~ cond)
dds <- DESeq(dds)
res <- results(dds)
res <- data.frame(res)
fc=1.2
pval=0.05
res = res %>% mutate(DE=ifelse(log2FoldChange >= log2(fc) & padj < pval, 'UP',
ifelse(log2FoldChange <= -log2(fc) & padj < pval, 'DN','no_sig')))
res$DE = factor(res$DE, levels = c('UP','DN','no_sig'))
legend_data <- data.frame(
log2FoldChange = c(0, 0, 0),
padj = c(0, 0, 0),
DE = c("UP", "DN", "no_sig")
)
res %>%
ggplot(aes(log2FoldChange, -log10(padj))) +
geom_point(size = 0.1, alpha = 0.5, color = "grey") +
geom_point(data = res[res$DE == "UP", ], color = "red", size = 0.2) +
geom_point(data = res[res$DE == "DN", ], color = "blue", size = 0.2) +
geom_vline(xintercept = c(-log2(fc), log2(fc)), color = 'grey') +
geom_hline(yintercept = -log10(pval), color = 'grey') +
##
geom_point(data = legend_data,
aes(color = DE),
size = 0) + #
scale_color_manual(
values = c("UP" = "red", "DN" = "blue", "no_sig" = "grey"),
name = "Significance"
) +
xlim(c(-10, 10)) + ylim(c(0, 75)) +
theme_bw() +
ggtitle("Resist/Naive") +
guides(colour = guide_legend(override.aes = list(size=5))) +
ggtitle("CND1/CTRL") +
ggeasy::easy_center_title()
hallmark <- msigdbr::msigdbr(species = "Mus musculus", category = "H") %>%
dplyr::select(gs_name, gene_symbol)
# If you like "entrez_gene", change gene_symbol to entrez_gene
gobp <- msigdbr::msigdbr(species = "Mus musculus", category = "C5") %>%
dplyr::select(gs_name, gene_symbol)
library(clusterProfiler)
perform_GSEA <- function(res, ref, pvalueCutoff = 1) {
ranking <- function(res) {
# Check the name of log2fc related
if ("avg_log2FC" %in% names(res)) {
df <- res$avg_log2FC
} else if ("log2FoldChange" %in% names(res)) {
df <- res$log2FoldChange
} else {
stop("Neither avg_log2FC nor log2FoldChange columns found in the data frame.")
}
names(df) <- rownames(res)
df <- sort(df, decreasing = TRUE)
return(df)
}
ranked.res <- ranking(res)
x <- clusterProfiler::GSEA(geneList = ranked.res,
TERM2GENE = ref,
pvalueCutoff = pvalueCutoff,
pAdjustMethod = "BH",
verbose = TRUE,
seed = TRUE)
result <- x@result %>% arrange(desc(NES))
result <- result[, c('NES', 'pvalue', 'p.adjust', 'core_enrichment', 'ID')]
return(result)
}
# Application
gsea.res1 = perform_GSEA(res = res, ref = hallmark, pvalueCutoff = 1)
# GESA Plot
gsea_nes_plot <- function(gsea.res, title, color="p.adjust") {
gsea.res = gsea.res %>% mutate(sig=ifelse(p.adjust <= 0.05, "FDR <= 0.05", "FDX >= 0.05"))
# basic plot
p <- gsea.res %>%
ggplot(aes(reorder(ID, NES), NES)) +
geom_col(aes(fill=!!sym(color))) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score", title="GSEA") +
theme_classic() +
theme(axis.text.x = element_text(size=5, face = 'bold'),
axis.text.y = element_text(size=6, face = 'bold'),
axis.title = element_text(size=10)) +
ggtitle(title)
# color by color input type
if (color == "p.adjust") {
p <- p + scale_fill_gradient(low = 'red', high = '#E5E7E9')
} else if (color == "sig") {
p <- p + scale_fill_manual(values = c("red", "#E5E7E9"))
}
return(p)
}
# Application
gsea_nes_plot(gsea.res = gsea.res1, title = "Test1", color = "p.adjust") # color by p.adjust
gsea_nes_plot(gsea.res = gsea.res1, title = "Test2", color = "sig") # color by significance
# enrichplot::gseaplot2 function
perform_GSEA <- function(res, ref, pvalueCutoff = 1) {
ranking <- function(res) {
# Check the name of log2fc related
if ("avg_log2FC" %in% names(res)) {
df <- res$avg_log2FC
} else if ("log2FoldChange" %in% names(res)) {
df <- res$log2FoldChange
} else {
stop("Neither avg_log2FC nor log2FoldChange columns found in the data frame.")
}
names(df) <- rownames(res)
df <- sort(df, decreasing = TRUE)
return(df)
}
ranked.res <- ranking(res)
x <- clusterProfiler::GSEA(geneList = ranked.res,
TERM2GENE = ref,
pvalueCutoff = pvalueCutoff,
pAdjustMethod = "BH",
verbose = TRUE,
seed = TRUE)
return(x)
}
gsea.res = perform_GSEA(res = res, ref = hallmark, pvalueCutoff = 1)
# Significant pathways
ids = gsea.res@result %>% filter(p.adjust <= 0.05) %>% rownames()
enrichplot::gseaplot2(gsea.res, geneSetID = ids[1], title = ids[1])
enrichplot::gseaplot2(gsea.res, geneSetID = ids[2], title = ids[2])
enrichplot::gseaplot2(gsea.res, geneSetID = ids[3], title = ids[3])
enrichplot::gseaplot2(gsea.res, geneSetID = ids[4], title = ids[4])
Single sample GSEA
Single-sample Gene Set Enrichment Analysis (ssGSEA) is an variation
of the GSEA algorithm that instead of calculating enrichment scores for
groups of samples (i.e Control vs Disease) and sets of genes (i.e
pathways), it provides a score for each each sample and gene set
pair.
(https://www.genepattern.org/modules/docs/ssGSEAProjection/4#gsc.tab=0)
## Perform ssgsea
library(corto)
## Input data : tpm (count.mtx is accepted as well)
test = ssgsea(tpm,hallmarkList)
## Reshape it to plot
test.df = test %>% t() %>% data.frame()
rownames(test.df) = colnames(tpm)
## p value of ssgsea
pval = corto::z2p(test)
colnames(pval) = rownames(test.df)
Color represents enrichment score
## Heatmap
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(60),
colorRampPalette(colors = c("white","#D35400"))(70))
t(test.df) %>% pheatmap::pheatmap(color = my.color,
cluster_rows = F,
cluster_cols = F,
main = "ssGSEA of HALLMARK pathways",
gaps_col = c(3,6)) # Simple heatmap
Red: p value <= 0.05 (significant)
White : p value > 0.05 (not significant)
## Heatmap
input.data = (pval<=0.05)
input.data[] <- +input.data
input.data %>% pheatmap::pheatmap(color = c("grey99","salmon"),
cluster_rows = F,
cluster_cols = F,
main = "Significance by p value",
gaps_col = c(3,6)) # Simple heatmap
# Plot it
genesetname = "WP Apoptosis"
test.df = test.df %>%
mutate(significance = ifelse(TCSA_pval <= 0.05, max(test.df[,1]),0))
input = data.frame(test.df[,c(1,3)])
rownames(input) = rownames(test.df)
colnames(input) = c(genesetname, "p value < = 0.05")
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(50),
colorRampPalette(colors = c("white","#D35400"))(40))
input %>% pheatmap::pheatmap(cluster_rows = F, cluster_cols = F,
color=my.color, border_color = "grey33",
gaps_col = 1,
main = "ssGSEA with significance")
# Plot it
genesetname = "GPBP DNA Damage"
test.df = test.df %>%
mutate(significance = ifelse(TCSA_pval <= 0.05, max(test.df[,1]),0))
input = data.frame(test.df[,c(1,3)])
rownames(input) = rownames(test.df)
colnames(input) = c(genesetname, "p value < = 0.05")
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(50),
colorRampPalette(colors = c("white","#D35400"))(60))
input %>% pheatmap::pheatmap(cluster_rows = F, cluster_cols = F,
color=my.color, border_color = "grey33",
gaps_col = 1,
main = "ssGSEA with significance")
# Import sample data
tpm.sample = read.csv("../sampledata/tpm.sample.csv", row.names = 1)
cellmarker = read.csv('../info/DB/CellMarker2.0.csv')
celltypes = c("B cell",
"CD8+ T cell",
"Activated T cell",
"Cancer cell",
"CD4+ T cell",
"Dendritic cell",
"Macrophage")
# Functions
mks.violinplot = function(cellname, tpm=tpm.sample){
rs <- grepl(cellname, cellmarker$cell_name, ignore.case = TRUE)
mks = cellmarker[rs, ]$marker
mks = mks[mks %in% rownames(tpm)]
tpm.df =tpm %>% filter(rownames(.) %in% mks)
tpm.df$gene = rownames(tpm.df)
p= tpm.df %>% reshape::melt() %>% ggplot(aes(.[,2], log10(.[,3]))) +
geom_violin(aes(fill = variable), alpha = 0.1, show.legend = FALSE) +
geom_boxplot(outlier.size = 0, width = 0.2, alpha = 0.5, show.legend = FALSE) +
geom_jitter(aes(color = variable), alpha = 0.5, size = 1, show.legend = FALSE) +
scale_y_continuous(labels = scales::comma) +
theme_bw() +
labs(y = paste0(cellname, " genes (log10 scaled)"),
title = paste0(cellname, " signature genes ")) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust=1, size=14),
panel.grid.major.x = element_blank(),
axis.title.y = element_text(size = 14), # y 축 레이블 폰트 크기 설정
plot.title = element_text(size = 20)
)
return(p)
}
mks.violinplot(cellname = "B cell", tpm=tpm.sample)
mks.violinplot(cellname = "Dendritic cell", tpm=tpm.sample)
# Function 1
# condition1 = CTRL
DEG_out = function(count.raw= count.mtx, condition1, condition2){
df1 = count.raw %>% select(contains(condition1))
df2 = count.raw %>% select(contains(condition2))
count.mtx = cbind(df1,df2)
# Generate info table
info <- data.frame(matrix(nrow = ncol(count.mtx), ncol = 2))
colnames(info) <- c('sample', 'cond')
info$sample <- colnames(count.mtx)
info = info %>% mutate(cond = ifelse(sample %in% colnames(df1), condition1,condition2))
info$cond = factor(info$cond, levels = c(condition1, condition2))
# info
# DESeq: DEG analysis
dds <- DESeqDataSetFromMatrix(count.mtx, info, ~ cond)
dds <- DESeq(dds)
res <- results(dds)
# make DEG output a data frame for downstream analysis
res <- data.frame(res)
res <- res %>% filter(baseMean != 0)
# p value == NA adjustment
res[is.na(res)] <- 1
# res[is.na(res$pvalue),]
return(res)
}
# Function 2 for volcanoplot
DEG_volcanoplot = function(res= res ,pval=0.05, fc = 1.2,
condition1=condition1,
condition2=condition2){
res = res %>% mutate(DE=ifelse(log2FoldChange >= log2(fc) & pvalue <= pval, 'UP',
ifelse(log2FoldChange <= -log2(fc) & pvalue <= pval, 'DN','no_sig')))
res$DE = factor(res$DE, levels = c('UP','DN','no_sig'))
# Draw plot
res %>%
filter(pvalue !=1) %>% # remove the NA values
ggplot(aes(log2FoldChange, -log10(pvalue), color=DE)) +
geom_point(size=1, alpha=0.5) +
scale_color_manual(values = c('red','blue','grey')) +
theme_classic() +
geom_vline(xintercept = c(-log2(fc),log2(fc)), color='grey') +
geom_hline(yintercept = -log10(0.05),color='grey') +
guides(colour = guide_legend(override.aes = list(size=5))) +
ggtitle(paste0(condition2, " / ", condition1)) +
ggeasy::easy_center_title() ## to center title
}
# Function 3
DEG_info = function(res, n=20, pval=0.05, fc = 1.2){
res = res %>% mutate(DE=ifelse(log2FoldChange >= log2(fc) & pvalue <= pval, 'UP',
ifelse(log2FoldChange <= -log2(fc) & pvalue <= pval, 'DN','no_sig')))
res$DE = factor(res$DE, levels = c('UP','DN','no_sig'))
df = res$DE %>% table() %>% data.frame()
colnames(df) = c("DEG", "Number of genes")
up = res %>% filter(DE == "UP") %>% arrange(desc(log2FoldChange)) %>%
rownames() %>% head(n) %>% paste0(collapse = ",")
dn = res %>% filter(DE == "DN") %>% arrange(log2FoldChange) %>%
rownames() %>% head(n) %>% paste0(collapse = ",")
df$top_genes_by_Log2FC =""
df[1,3] = up
df[2,3] = dn
return(df)
}
DEG_table=function(res, pval=0.05, fc = 1.2){
res = res %>% mutate(DE=ifelse(log2FoldChange >= log2(fc) & pvalue <= pval, 'UP',ifelse(log2FoldChange <= -log2(fc) & pvalue <= pval, 'DN','no_sig')))
res$DE = factor(res$DE, levels = c('UP','DN','no_sig'))
return(res)
}
From three comparisons
Significant genes X samples matrix was used for kmeans clustering to
identify any patterns across data.
Clustering method: Kmeans
Adjustable parameter : k (number of clusters)
tpm = read.csv("../sampledata/RNAseq_sample_tpm.csv", row.names = 1)
tpm = tpm[significant_genes_all,]
############## kmeans clustering #####################
## zscore function
zscore <- function(input.data = input.data){
input.data.rowsums <- rowSums(input.data)
input.data.mean <- rowMeans(input.data)
input.data.sd <- matrixStats::rowSds(as.matrix(input.data))
names(input.data.sd) <- rownames(input.data)
zscore <- (input.data-input.data.mean)/input.data.sd
return(zscore)
}
# Create input data with zscaled
input.data <- zscore(tpm[significant_genes_all,])
input.data <- na.omit(input.data)
# Perform kmeans clustering
kmeans_clustering = function(k=3, input.data= input.data){
set.seed(1234)
fitout <- kmeans(input.data, centers = k, nstart = k)
fitout.cluster <- fitout$cluster
# table(fitout.cluster)
df = fitout.cluster %>% as.data.frame()
colnames(df) = "cluster"
rownames(df) = names(fitout.cluster)
df = df %>% arrange(cluster)
df$cluster = paste0("Cluster", df$cluster)
df$cluster = factor(df$cluster)
return(df)
}
df3 = kmeans_clustering(k = 3, input.data = input.data)
df4 = kmeans_clustering(k = 4, input.data = input.data)
df5 = kmeans_clustering(k = 5, input.data = input.data)
df6 = kmeans_clustering(k = 6, input.data = input.data)
# Draw heatmap
kmeans_heatmap = function(input.df, input.data){
df.tmp = input.df$cluster %>% table() %>% data.frame()
my.color=c(colorRampPalette(colors = c("royalblue","white"))(80),
colorRampPalette(colors = c("white","indianred2"))(90))
p =input.data[rownames(input.df),] %>%
pheatmap::pheatmap(cluster_cols = F, cluster_rows = F,
show_rownames = F,
annotation_row = input.df,
col = my.color,
fontsize_row = 6,fontsize_col = 10,
border_color = 'NA',
gaps_col = c(4,8),
gaps_row = cumsum(df.tmp$Freq))
print(p)
}