RNA-seq analysis workflow

Back to Main Page
Newer version (being updated)

Preparation

Load packages

library(dplyr)
library(ggplot2)
library(DESeq2)
library(edgeR)
library(reshape)
library(dplyr)

Sample data

URL: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE240866

Import data

# Import data
dir = "~/Desktop/DF/public/"

df <- read.table(paste0(dir, "GSE240866_readcounts.txt"), 
                 header=TRUE, sep="\t", stringsAsFactors=FALSE)

# Rename data
df = df %>% select(Geneid, contains("bam"))
colnames(df) = c("gene", paste0("normal", 1:3), paste0("RUX", 1:3))
count.mtx = df
count.mtx = count.mtx[!(duplicated(count.mtx$gene)),]
rownames(count.mtx) = count.mtx$gene
count.mtx = count.mtx[,-1]

# Remove genes with no expression across samples 
count.mtx = count.mtx[rowSums(count.mtx) !=0,]

Principal component analysis

PCA data

se <- SummarizedExperiment(as.matrix(count.mtx), 
                           colData=DataFrame(sample=1:ncol(count.mtx)))
dds <- DESeqDataSet(se, ~ 1)
dds$sample = colnames(dds)
dds$group = c(rep("normal", 3), rep("RUX", 3))
vsd <- vst(dds, blind=FALSE)

pcaData <- DESeq2::plotPCA(vsd, intgroup = "sample", returnData = TRUE)
pcaData$group = dds$group
PCA_var=attr(pcaData, "percentVar")

PCA Plot

ggplot(pcaData, aes(x = PC1, y = PC2, fill = group)) +
  geom_point(size = 4, alpha = 0.6, shape = 21, color = "black", stroke = 0.5)  +
  ggrepel::geom_text_repel(aes(label=name), 
                           color="grey6", size=3, hjust= -0.3, vjust=-0.3) +
  labs(x = paste("PC1: ", round(100 * PCA_var[1]), "% variance"),
       y = paste("PC2: ", round(100 * PCA_var[2]), "% variance")) +
  theme_bw() +
  theme(legend.title = element_blank()) +
  ggtitle("PCA") +
  labs(caption = " ")

By group

ggplot(pcaData, aes(x = PC1, y = PC2, fill = group)) +
  geom_point(size = 4, alpha = 0.6, shape = 21, color = "black", stroke = 0.5)  +
  ggrepel::geom_text_repel(aes(label=name), 
                           color="grey6", size=3, hjust= -0.3, vjust=-0.3) +
  labs(x = paste("PC1: ", round(100 * PCA_var[1]), "% variance"),
       y = paste("PC2: ", round(100 * PCA_var[2]), "% variance")) +
  theme_bw() +
  theme(legend.title = element_blank(), legend.position = "none") +
  ggtitle("PCA") +
  labs(caption = " ") +
  facet_wrap(.~group, ncol = 2)

Differentially Expressed Genes (DEG) analysis

DEG data

# Generate info table
info <- data.frame(matrix(nrow = ncol(count.mtx), ncol = 2))
colnames(info) <- c('sample', 'cond')
info$sample <- colnames(count.mtx)
info$cond <- dds$group

# DESeq
dds <- DESeqDataSetFromMatrix(count.mtx, info, ~ cond)
dds <- DESeq(dds)
res <- results(dds)

res <- data.frame(res)

Volcanoplot

fc = 1.5
pval = 0.05

res = res %>% mutate(DE=ifelse(log2FoldChange >= log2(fc) & pvalue < pval, 'UP',
                               ifelse(log2FoldChange <= -log2(fc) & pvalue < pval, 'DN','no_sig')))
res$DE = factor(res$DE, levels = c('UP','DN','no_sig'))
res %>% 
  ggplot(aes(log2FoldChange, -log10(pvalue), color=DE)) + 
  geom_point(size=1, alpha=0.5) + 
  scale_color_manual(values = c('red','blue','grey')) +
  theme_classic() +
  geom_vline(xintercept = c(-log2(fc),log2(fc)), color='grey') +
  geom_hline(yintercept = -log10(0.05),color='grey') +
  guides(colour = guide_legend(override.aes = list(size=5))) +
  ggtitle(paste0(levels(dds$cond)[2], "/", levels(dds$cond)[1] )) +
  ggeasy::easy_center_title() ## to center title

DEG data table

res %>% DT::datatable(., editable = TRUE, extensions = "Buttons", options = list(dom="frtip", pageLength=10))

Gene set enrichment analysis

Geneset

hallmark <- msigdbr::msigdbr(species = "Mus musculus", category = "H") %>% 
  dplyr::select(gs_name, gene_symbol) 

GSEA Function

library(clusterProfiler)

perform_GSEA <- function(res, ref, pvalueCutoff = 1) {
  ranking <- function(res) {
    # Check the name of log2fc related 
    if ("avg_log2FC" %in% names(res)) {
      df <- res$avg_log2FC
    } else if ("log2FoldChange" %in% names(res)) {
      df <- res$log2FoldChange
    } else {
      stop("Neither avg_log2FC nor log2FoldChange columns found in the data frame.")
    }
    names(df) <- rownames(res)
    df <- sort(df, decreasing = TRUE)
    return(df)
  }
  
  ranked.res <- ranking(res)
  
  x <- clusterProfiler::GSEA(geneList = ranked.res,
                             TERM2GENE = ref,
                             pvalueCutoff = pvalueCutoff,
                             pAdjustMethod = "BH",
                             verbose = TRUE,
                             seed = TRUE)
  
  result <- x@result %>% arrange(desc(NES))
  result <- result[, c('NES', 'pvalue', 'p.adjust', 'core_enrichment', 'ID')]
  return(result)
}
gsea.res = perform_GSEA(res = res, ref = hallmark, pvalueCutoff = 1)

GSEA Plot Function

# GSEA Plot 
gsea_nes_plot <- function(gsea.res, title, color="pvalue") {
  gsea.res = gsea.res %>% mutate(sig=ifelse(pvalue <= 0.1, "p value <= 0.1", "p value > 0.1"))
  # basic plot
  p <- gsea.res %>%
    ggplot(aes(reorder(ID, NES), NES)) +
    geom_col(aes(fill=!!sym(color)), color="grey5", size=0.1) +
    coord_flip() +
    labs(x="Pathway", y="Normalized Enrichment Score", title="GSEA") +
    theme_classic() +
    theme(axis.text.x = element_text(size=5, face = 'bold'),
          axis.text.y = element_text(size=6, face = 'bold'),
          axis.title = element_text(size=10)) +
    ggtitle(title)
  
  # color by color input type
  if (color == "pvalue") {
    p <- p + scale_fill_gradient(low = 'orangered', high = '#E5E7E9')
  } else if (color == "sig") {
    p <- p + scale_fill_manual(values = c("orangered", "#E5E7E9"))
  }
  return(p)
}

Plot

GSEA plot 1

gsea_nes_plot(gsea.res = gsea.res, title = "Test1", color = "pvalue") # color by p value

GSEA plot 2

gsea_nes_plot(gsea.res = gsea.res, title = "Test2", color = "sig") # color by significance



More example analyses will be updated.