Back to Main Page

Preparation

Load packages

library(dplyr)
library(ggplot2)
library(DESeq2)
library(edgeR)
library(reshape)
library(dplyr)

Sample data

Sample data (count matrix)

TPM calculation

Calculate TPM sample by sample

for(i in 1:ncol(count_mtx)){   
  rpkm = count_mtx[,i]/(gene_length/1000)   
  rpkm_sum = sum(rpkm)   
  tpm <- rpkm / rpkm_sum * 1e6   
  tpm_all[,i] = tpm   
}  

Principal component analysis

# PCA
se <- SummarizedExperiment(as.matrix(count.mtx), 
                           colData=DataFrame(sample=1:ncol(count.mtx)))
dds <- DESeqDataSet(se, ~ 1)
dds$sample = colnames(dds)
vsd <- vst(dds, blind=FALSE)

pcaData <- plotPCA(vsd, intgroup = "sample")
ggplot(pcaData, aes(x = PC1, y = PC2, fill = group)) +
  geom_point(size = 4, alpha = 0.6, shape = 21, color = "black", stroke = 0.5)  +
  ggrepel::geom_text_repel(aes(label=name), 
                           color="grey6", size=3, hjust= -0.3, vjust=-0.3) +
  labs(x = paste("PC1: ", round(100 * PCA_var[1]), "% variance"),
       y = paste("PC2: ", round(100 * PCA_var[2]), "% variance")) +
  theme_bw() +
  theme(legend.title = element_blank()) +
  ggtitle("PCA") +
  labs(caption = " ")

Differentially Expressed Genes (DEG) analysis

DEG data

# Generate info table
info <- data.frame(matrix(nrow = ncol(count.mtx), ncol = 2))
colnames(info) <- c('sample', 'cond')
info$sample <- colnames(count.mtx)
info$cond <- c(rep("CND1",4), rep("CND2",4), rep("CTRL",3))

# DESeq
dds <- DESeqDataSetFromMatrix(count.mtx, info, ~ cond)
dds <- DESeq(dds)
res <- results(dds)

Visualization of DEGs Volcanoplot

res <- data.frame(res)
fc=1.2
pval=0.05
res = res %>% mutate(DE=ifelse(log2FoldChange >= log2(fc) & padj < pval, 'UP',
                               ifelse(log2FoldChange <= -log2(fc) & padj < pval, 'DN','no_sig')))
res$DE = factor(res$DE, levels = c('UP','DN','no_sig'))

legend_data <- data.frame(
  log2FoldChange = c(0, 0, 0),  
  padj = c(0, 0, 0),           
  DE = c("UP", "DN", "no_sig")    
)

res %>% 
  ggplot(aes(log2FoldChange, -log10(padj))) + 
  geom_point(size = 0.1, alpha = 0.5, color = "grey") + 
  geom_point(data = res[res$DE == "UP", ], color = "red", size = 0.2) +
  geom_point(data = res[res$DE == "DN", ], color = "blue", size = 0.2) +
  geom_vline(xintercept = c(-log2(fc), log2(fc)), color = 'grey') +
  geom_hline(yintercept = -log10(pval), color = 'grey') +
  ##
  geom_point(data = legend_data, 
             aes(color = DE), 
             size = 0) + # 
  scale_color_manual(
    values = c("UP" = "red", "DN" = "blue", "no_sig" = "grey"),
    name = "Significance"
  ) +
  xlim(c(-10, 10)) + ylim(c(0, 75)) +
  theme_bw() +
  ggtitle("Resist/Naive") +
  guides(colour = guide_legend(override.aes = list(size=5))) +
  ggtitle("CND1/CTRL") +
  ggeasy::easy_center_title() 

Gene set enrichment analysis

Geneset

hallmark <- msigdbr::msigdbr(species = "Mus musculus", category = "H") %>% 
  dplyr::select(gs_name, gene_symbol) 
# If you like "entrez_gene", change gene_symbol to entrez_gene
gobp <- msigdbr::msigdbr(species = "Mus musculus", category = "C5") %>% 
  dplyr::select(gs_name, gene_symbol)

Perform GSEA

library(clusterProfiler)

perform_GSEA <- function(res, ref, pvalueCutoff = 1) {
  ranking <- function(res) {
    # Check the name of log2fc related 
    if ("avg_log2FC" %in% names(res)) {
      df <- res$avg_log2FC
    } else if ("log2FoldChange" %in% names(res)) {
      df <- res$log2FoldChange
    } else {
      stop("Neither avg_log2FC nor log2FoldChange columns found in the data frame.")
    }
    names(df) <- rownames(res)
    df <- sort(df, decreasing = TRUE)
    return(df)
  }
  
  ranked.res <- ranking(res)
  
  x <- clusterProfiler::GSEA(geneList = ranked.res,
                             TERM2GENE = ref,
                             pvalueCutoff = pvalueCutoff,
                             pAdjustMethod = "BH",
                             verbose = TRUE,
                             seed = TRUE)
  
  result <- x@result %>% arrange(desc(NES))
  result <- result[, c('NES', 'pvalue', 'p.adjust', 'core_enrichment', 'ID')]
  return(result)
}


# Application 
gsea.res1 = perform_GSEA(res = res, ref = hallmark, pvalueCutoff = 1)

NES plot

# GESA Plot 
gsea_nes_plot <- function(gsea.res, title, color="p.adjust") {
  gsea.res = gsea.res %>% mutate(sig=ifelse(p.adjust <= 0.05, "FDR <= 0.05", "FDX >= 0.05"))
  # basic plot
  p <- gsea.res %>%
    ggplot(aes(reorder(ID, NES), NES)) +
    geom_col(aes(fill=!!sym(color))) +
    coord_flip() +
    labs(x="Pathway", y="Normalized Enrichment Score", title="GSEA") +
    theme_classic() +
    theme(axis.text.x = element_text(size=5, face = 'bold'),
          axis.text.y = element_text(size=6, face = 'bold'),
          axis.title = element_text(size=10)) +
    ggtitle(title)

  # color by color input type
  if (color == "p.adjust") {
    p <- p + scale_fill_gradient(low = 'red', high = '#E5E7E9')
  } else if (color == "sig") {
    p <- p + scale_fill_manual(values = c("red", "#E5E7E9"))
  }
  return(p)
}

# Application 

gsea_nes_plot(gsea.res = gsea.res1, title = "Test1", color = "p.adjust") # color by p.adjust
gsea_nes_plot(gsea.res = gsea.res1, title = "Test2", color = "sig") # color by significance

Enrichment plot for the individual pathway

# enrichplot::gseaplot2 function 
perform_GSEA <- function(res, ref, pvalueCutoff = 1) {
  ranking <- function(res) {
    # Check the name of log2fc related 
    if ("avg_log2FC" %in% names(res)) {
      df <- res$avg_log2FC
    } else if ("log2FoldChange" %in% names(res)) {
      df <- res$log2FoldChange
    } else {
      stop("Neither avg_log2FC nor log2FoldChange columns found in the data frame.")
    }
    names(df) <- rownames(res)
    df <- sort(df, decreasing = TRUE)
    return(df)
  }
  
  ranked.res <- ranking(res)
  
  x <- clusterProfiler::GSEA(geneList = ranked.res,
                             TERM2GENE = ref,
                             pvalueCutoff = pvalueCutoff,
                             pAdjustMethod = "BH",
                             verbose = TRUE,
                             seed = TRUE)
  return(x)
}

gsea.res = perform_GSEA(res = res, ref = hallmark, pvalueCutoff = 1)

# Significant pathways 
ids = gsea.res@result %>% filter(p.adjust <= 0.05) %>% rownames()
enrichplot::gseaplot2(gsea.res, geneSetID = ids[1], title = ids[1])
enrichplot::gseaplot2(gsea.res, geneSetID = ids[2], title = ids[2])
enrichplot::gseaplot2(gsea.res, geneSetID = ids[3], title = ids[3])
enrichplot::gseaplot2(gsea.res, geneSetID = ids[4], title = ids[4])

ssGSEA

Single sample GSEA

Single-sample Gene Set Enrichment Analysis (ssGSEA) is an variation of the GSEA algorithm that instead of calculating enrichment scores for groups of samples (i.e Control vs Disease) and sets of genes (i.e pathways), it provides a score for each each sample and gene set pair.
(https://www.genepattern.org/modules/docs/ssGSEAProjection/4#gsc.tab=0)


## Perform ssgsea 
library(corto)

## Input data : tpm (count.mtx is accepted as well) 
test = ssgsea(tpm,hallmarkList)

## Reshape it to plot 
test.df = test %>% t() %>% data.frame()
rownames(test.df) = colnames(tpm)

## p value of ssgsea 
pval = corto::z2p(test)
colnames(pval) = rownames(test.df)

ssGSEA heatmap

Color represents enrichment score


## Heatmap 
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(60),
           colorRampPalette(colors = c("white","#D35400"))(70))
t(test.df) %>% pheatmap::pheatmap(color = my.color, 
                                  cluster_rows = F,
                                  cluster_cols = F,
                                  main = "ssGSEA of HALLMARK pathways",
                                  gaps_col = c(3,6)) # Simple heatmap 

Significance by p value Heatmap

Red: p value <= 0.05 (significant)
White : p value > 0.05 (not significant)

## Heatmap 
input.data = (pval<=0.05)
input.data[] <- +input.data
input.data %>% pheatmap::pheatmap(color = c("grey99","salmon"), 
                                  cluster_rows = F,
                                  cluster_cols = F, 
                                  main = "Significance by p value",
                                  gaps_col = c(3,6)) # Simple heatmap 

Single pathway ssGSEA

Pathway sample 1

# Plot it 
genesetname = "WP Apoptosis"
test.df = test.df %>% 
  mutate(significance = ifelse(TCSA_pval <= 0.05, max(test.df[,1]),0))
input = data.frame(test.df[,c(1,3)])
rownames(input) = rownames(test.df)
colnames(input) = c(genesetname, "p value < = 0.05")
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(50),
           colorRampPalette(colors = c("white","#D35400"))(40))
input %>% pheatmap::pheatmap(cluster_rows = F, cluster_cols = F, 
                             color=my.color, border_color = "grey33", 
                             gaps_col = 1, 
                             main = "ssGSEA with significance")

Pathway sample 2

# Plot it 
genesetname = "GPBP DNA Damage"
test.df = test.df %>% 
  mutate(significance = ifelse(TCSA_pval <= 0.05, max(test.df[,1]),0))
input = data.frame(test.df[,c(1,3)])
rownames(input) = rownames(test.df)
colnames(input) = c(genesetname, "p value < = 0.05")
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(50),
           colorRampPalette(colors = c("white","#D35400"))(60))
input %>% pheatmap::pheatmap(cluster_rows = F, cluster_cols = F, 
                             color=my.color, border_color = "grey33", 
                             gaps_col = 1, 
                             main = "ssGSEA with significance")

Multiple genes-violin plot across samples

# Import sample data 
tpm.sample = read.csv("../sampledata/tpm.sample.csv", row.names = 1)
cellmarker = read.csv('../info/DB/CellMarker2.0.csv')

Functions

celltypes = c("B cell",
              "CD8+ T cell",
              "Activated T cell",
              "Cancer cell",
              "CD4+ T cell",
              "Dendritic cell",
              "Macrophage")

# Functions  
mks.violinplot = function(cellname, tpm=tpm.sample){
  rs <- grepl(cellname, cellmarker$cell_name, ignore.case = TRUE)
  
  mks = cellmarker[rs, ]$marker 
  mks = mks[mks %in% rownames(tpm)]
  
  tpm.df =tpm %>% filter(rownames(.) %in% mks)
  tpm.df$gene = rownames(tpm.df)
  
  p= tpm.df %>% reshape::melt() %>% ggplot(aes(.[,2], log10(.[,3]))) +
    
    geom_violin(aes(fill = variable), alpha = 0.1, show.legend = FALSE) +
    geom_boxplot(outlier.size = 0, width = 0.2, alpha = 0.5, show.legend = FALSE) +
    geom_jitter(aes(color = variable), alpha = 0.5, size = 1, show.legend = FALSE) +
    scale_y_continuous(labels = scales::comma) +
    theme_bw() +
    labs(y = paste0(cellname, "  genes (log10 scaled)"),
         title = paste0(cellname, " signature genes ")) +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 90, hjust=1, size=14),
      panel.grid.major.x = element_blank(),
      axis.title.y = element_text(size = 14),  # y 축 레이블 폰트 크기 설정
      plot.title = element_text(size = 20)
    )
  return(p)
}

Plot1

mks.violinplot(cellname = "B cell", tpm=tpm.sample)

Plot2

mks.violinplot(cellname = "Dendritic cell", tpm=tpm.sample)

DEG analysis sample analysis

Functions

# Function 1 
# condition1 = CTRL 
DEG_out = function(count.raw= count.mtx, condition1, condition2){
  df1 = count.raw %>% select(contains(condition1)) 
  df2 = count.raw %>% select(contains(condition2)) 
  
  count.mtx = cbind(df1,df2)
  
  # Generate info table
  info <- data.frame(matrix(nrow = ncol(count.mtx), ncol = 2))
  colnames(info) <- c('sample', 'cond')
  info$sample <- colnames(count.mtx)
  info = info %>% mutate(cond = ifelse(sample %in% colnames(df1), condition1,condition2)) 
  info$cond = factor(info$cond, levels = c(condition1, condition2))
  # info
  
  # DESeq: DEG analysis 
  dds <- DESeqDataSetFromMatrix(count.mtx, info, ~ cond)
  dds <- DESeq(dds)
  res <- results(dds)
  
  # make DEG output a data frame for downstream analysis
  res <- data.frame(res)
  res <- res %>% filter(baseMean != 0)
  
  # p value == NA adjustment 
  res[is.na(res)] <- 1
  
  # res[is.na(res$pvalue),]
  return(res)
}


# Function 2 for volcanoplot  
DEG_volcanoplot = function(res= res ,pval=0.05, fc = 1.2, 
                           condition1=condition1, 
                           condition2=condition2){
  res = res %>% mutate(DE=ifelse(log2FoldChange >= log2(fc) & pvalue <= pval, 'UP',
                                 ifelse(log2FoldChange <= -log2(fc) & pvalue <= pval, 'DN','no_sig')))
  res$DE = factor(res$DE, levels = c('UP','DN','no_sig'))
  
  # Draw plot 
  res %>% 
    filter(pvalue !=1) %>% # remove the NA values 
    ggplot(aes(log2FoldChange, -log10(pvalue), color=DE)) + 
    geom_point(size=1, alpha=0.5) + 
    scale_color_manual(values = c('red','blue','grey')) +
    theme_classic() +
    geom_vline(xintercept = c(-log2(fc),log2(fc)), color='grey') +
    geom_hline(yintercept = -log10(0.05),color='grey') +
    guides(colour = guide_legend(override.aes = list(size=5))) +
    ggtitle(paste0(condition2, " / ", condition1)) +
    ggeasy::easy_center_title() ## to center title
  
}

# Function 3 
DEG_info = function(res, n=20, pval=0.05, fc = 1.2){
  res = res %>% mutate(DE=ifelse(log2FoldChange >= log2(fc) & pvalue <= pval, 'UP',
                                 ifelse(log2FoldChange <= -log2(fc) & pvalue <= pval, 'DN','no_sig')))
  res$DE = factor(res$DE, levels = c('UP','DN','no_sig'))
  
  df = res$DE %>% table() %>% data.frame()
  colnames(df) = c("DEG", "Number of genes")
  
  up = res %>% filter(DE == "UP") %>% arrange(desc(log2FoldChange)) %>% 
    rownames() %>% head(n) %>% paste0(collapse = ",")
  dn = res %>% filter(DE == "DN") %>% arrange(log2FoldChange) %>% 
    rownames() %>% head(n) %>% paste0(collapse = ",")
  
  df$top_genes_by_Log2FC =""
  df[1,3] = up
  df[2,3] = dn
  
  return(df)
}

DEG_table=function(res, pval=0.05, fc = 1.2){
  res = res %>% mutate(DE=ifelse(log2FoldChange >= log2(fc) & pvalue <= pval, 'UP',ifelse(log2FoldChange <= -log2(fc) & pvalue <= pval, 'DN','no_sig')))
  res$DE = factor(res$DE, levels = c('UP','DN','no_sig'))
  return(res)
}

DEG CND1 vs CTRL

  1. DEG information table
  2. Volcanoplot
  3. DEG output.

DEG CND2 vs CTRL

  1. DEG information table
  2. Volcanoplot
  3. DEG output.

DEG CND2 vs CND1

  1. DEG information table
  2. Volcanoplot
  3. DEG output.

Select the signicant DEGs for the next step

From three comparisons

  • DEG1
  • DEG2
  • DEG3



Kmeans clustering

Significant genes X samples matrix was used for kmeans clustering to identify any patterns across data.
Clustering method: Kmeans
Adjustable parameter : k (number of clusters)

tpm = read.csv("../sampledata/RNAseq_sample_tpm.csv", row.names = 1)
tpm = tpm[significant_genes_all,]

############## kmeans clustering #####################
## zscore function 
zscore <- function(input.data = input.data){
  input.data.rowsums <- rowSums(input.data)
  input.data.mean <- rowMeans(input.data)
  input.data.sd <- matrixStats::rowSds(as.matrix(input.data))
  names(input.data.sd) <- rownames(input.data)
  zscore <- (input.data-input.data.mean)/input.data.sd
  return(zscore)
}

# Create input data with zscaled 
input.data <- zscore(tpm[significant_genes_all,])
input.data <- na.omit(input.data)


# Perform kmeans clustering 
kmeans_clustering = function(k=3, input.data= input.data){
  set.seed(1234)
  fitout <- kmeans(input.data, centers = k, nstart = k)
  fitout.cluster <- fitout$cluster
  # table(fitout.cluster)
  df = fitout.cluster %>% as.data.frame()
  colnames(df) = "cluster"
  rownames(df) = names(fitout.cluster)
  df = df %>% arrange(cluster)
  df$cluster = paste0("Cluster", df$cluster)
  df$cluster = factor(df$cluster)
  return(df)
}

df3 = kmeans_clustering(k = 3, input.data = input.data)
df4 = kmeans_clustering(k = 4, input.data = input.data)
df5 = kmeans_clustering(k = 5, input.data = input.data)
df6 = kmeans_clustering(k = 6, input.data = input.data)

# Draw heatmap 
kmeans_heatmap = function(input.df, input.data){
  df.tmp = input.df$cluster %>% table() %>% data.frame()
  my.color=c(colorRampPalette(colors = c("royalblue","white"))(80),
             colorRampPalette(colors = c("white","indianred2"))(90))
  
  p =input.data[rownames(input.df),] %>% 
    pheatmap::pheatmap(cluster_cols = F, cluster_rows = F,
                       show_rownames = F,
                       annotation_row = input.df,
                       col = my.color,
                       fontsize_row = 6,fontsize_col = 10, 
                       border_color = 'NA', 
                       gaps_col = c(4,8), 
                       gaps_row = cumsum(df.tmp$Freq))
  print(p)
}

K = 3

K = 4

K = 5

K = 6