cellranger : 10x Genomics’s Chromium system to
process single cell RNA-seq raw data. Easy to use but need large
computing resources and Linux environment.
Input : Fastq files, cellranger genome index
# cellranger count for transcriptome data processing
cellranger count --id=SampleA_GEX \
--transcriptome=/path/to/reference/transcriptomes/Mouse_GEX_2020/refdata-gex-mm10-2020-A \
--fastqs=/path/to/raw_data/SampleA \
--sample=SampleA \
--expect-cells=10000
# cellranger aggr to aggregate multiple cellranger count output to one file
cellranger aggr --id=Aggregate_GEX \
--csv=/path/to/aggregate_info/Aggregate_GEX.csv
library_id,molecule_h5
SampleA,/path/to/SampleA/outs/molecule_info.h5
SampleB,/path/to/SampleB/outs/molecule_info.h5
SampleC,/path/to/SampleC/outs/molecule_info.h5
Aggregated Feature-Barcode Matrices
matrix.mtx
genes.tsv
barcodes.tsv
library(dplyr)
library(ggplot2)
library(reshape)
library(dplyr)
Data import from the cellranger output (above).
Create Seurat Object
library(Seurat)
obj.raw = Read10X('raw_data/Path/filtered_feature_bc_matrix/')
obj.srt = CreateSeuratObject(counts = obj.raw, project = 'project_name')
scrublet runs in python env.
Input file is the count matrix from seurat object.
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
def run_scrublet(input_file, output_file, expected_doublet_rate=0.1):
df = pd.read_csv(input_file, header=0, index_col=0)
adata = sc.AnnData(df)
# Set the expected_doublet_rate parameter
sc.external.pp.scrublet(adata, expected_doublet_rate=expected_doublet_rate)
# Save the observation (results) to CSV
adata.obs.to_csv(output_file)
input_file = 'input.count.csv'
output_file = 'output.scr.csv'
# Adjust the expected_doublet_rate parameter
expected_doublet_rate = 0.1
run_scrublet(input_file, output_file, expected_doublet_rate)
Add the scrublet output information to the seurat object
## import scrublet result
df= read.csv('path/output.scr.csv', row.names = 1)
# If the previous process changes the name of cell ID,
# Use the following code to correct it.
rownames(df) = gsub(pattern = '_', replacement = '-' ,rownames(df))
## add doublet info to the srt obj
obj.srt[['doublet_score']] = df[rownames(obj.srt@meta.data),]$doublet_score
obj.srt[['predicted_doublet']] =df[rownames(obj.srt@meta.data),]$predicted_doublet
obj.srt %>% saveRDS("path/saved.obj.rds")
# You might want to check the number of doublets in the data
obj.srt@meta.data %>% select(predicted_doublet) %>% table()
This process includes the following.
FindVariableFeatures(“vst” method)
NormalizeData
ScaleData
RunPCA
FindNeighbors
FindClusters
RunUMAP
## perform default analysis
perform_default_analysis <- function(obj.srt, n_features = 2000, n_pcs = 20,
dims_for_neighbors = 1:20,
resolutions = c(0.1,0.5,1),
umap_dims = 1:20) {
# Step 1: Find variable features
obj.srt <- FindVariableFeatures(obj.srt,
selection.method = 'vst',
nfeatures = n_features)
# Step 2: Scale and normalize data
all_genes <- rownames(obj.srt)
obj.srt <- NormalizeData(obj.srt)
obj.srt <- ScaleData(obj.srt, features = all_genes)
# Step 3: Run PCA
obj.srt <- RunPCA(obj.srt,
features = VariableFeatures(object = obj.srt), npcs = n_pcs)
# Step 4: Find neighbors
obj.srt <- FindNeighbors(obj.srt, dims = dims_for_neighbors)
# Step 5: Find clusters
obj.srt <- FindClusters(obj.srt, resolution = resolutions)
# Step 6: Run UMAP
obj.srt <- RunUMAP(obj.srt, dims = umap_dims)
# Return the Seurat object with analysis results
return(obj.srt)
}
# apply
obj.srt <- perform_default_analysis(obj.srt)
Seurat v5 uses IntegrateLayers() for batch correction.
Harmony is recommended for most datasets due to its speed and
robustness.
Reference: https://satijalab.org/seurat/articles/seurat5_integration
library(harmony)
# Define batch variable
batch_var <- "Timepoint"
# Step 1: Split RNA assay into layers by batch (Seurat v5)
obj.srt[["RNA"]] <- split(obj.srt[["RNA"]], f = obj.srt@meta.data[[batch_var]])
# Step 2: Standard preprocessing
obj.srt <- NormalizeData(obj.srt)
obj.srt <- FindVariableFeatures(obj.srt)
obj.srt <- ScaleData(obj.srt)
obj.srt <- RunPCA(obj.srt, npcs = 30)
# Step 3: Harmony integration via IntegrateLayers
obj.srt <- IntegrateLayers(
object = obj.srt,
method = HarmonyIntegration,
orig.reduction = "pca",
new.reduction = "harmony",
group.by.vars = batch_var,
verbose = FALSE
)
# Step 4: Re-join layers after integration
obj.srt <- JoinLayers(obj.srt)
# Step 5: Downstream analysis using harmony reduction
obj.srt <- FindNeighbors(obj.srt, reduction = "harmony", dims = 1:30)
obj.srt <- FindClusters(obj.srt, resolution = c(0.1, 0.2, 0.5, 0.8))
obj.srt <- RunUMAP(obj.srt, reduction = "harmony", dims = 1:30)
obj.srt %>% saveRDS("path/saved.obj.rds")
# Visualization
DimPlot(obj.srt, reduction = "umap", group.by = batch_var)
Other integration methods
RPCA (Seurat built-in, faster than CCA): use
method = RPCAIntegration
LIGER: https://github.com/welch-lab/liger
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
obj.srt <- CellCycleScoring(obj.srt,
s.features = s.genes, g2m.features = g2m.genes)
URL: https://github.com/IanevskiAleksandr/sc-type
# load gene set preparation function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
# load cell type annotation function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")
# DB file
db_ <- "https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_full.xlsx";
tissue <- "Immune system" # e.g. Immune system,Pancreas,Liver,Kidney,Brain,Lung
# prepare gene sets
gs_list <- gene_sets_prepare(db_, tissue)
# extract scaled scRNA-seq matrix
scRNAseqData_scaled <- as.matrix(obj.srt[["RNA"]]@scale.data)
# run ScType
es.max <- sctype_score(scRNAseqData = scRNAseqData_scaled,
scaled = TRUE,
gs = gs_list$gs_positive,
gs2 = gs_list$gs_negative)
# Merge by cluster
cL_results <- do.call("rbind",
lapply(unique(obj.srt@meta.data$seurat_clusters), function(cl) {
cl_data <- obj.srt@meta.data[obj.srt@meta.data$seurat_clusters == cl, ]
es_sum <- rowSums(es.max[, rownames(cl_data)])
es_sorted <- sort(es_sum, decreasing = TRUE)
top_es <- head(data.frame(cluster = cl,
type = names(es_sorted),
scores = es_sorted,
ncells = nrow(cl_data)), 10)
return(top_es)
})
)
# Calculate top scores by cluster
sctype_scores <- cL_results %>%
group_by(cluster) %>%
top_n(n = 1, wt = scores)
# Set low-confidence clusters to "Unknown"
threshold <- sctype_scores$ncells / 4
sctype_scores$type[sctype_scores$scores < threshold] <- "Unknown"
# overlay the identified cell types on UMAP plot
obj.srt@meta.data$sctype_classification = ""
# Assign cell type classification to meta data
unique_clusters <- unique(sctype_scores$cluster)
for (j in unique_clusters) {
# Extract the type for the current cluster
cl_type <- sctype_scores[sctype_scores$cluster == j, ]
type_label <- as.character(cl_type$type[1])
# Assign type label to all cells in the cluster
obj.srt@meta.data$sctype_classification[obj.srt@meta.data$seurat_clusters == j] <- type_label
}
# Draw plot
DimPlot(obj.srt, reduction = "umap", label = TRUE, repel = TRUE,
group.by = 'sctype_classification')
To assess the activation of gene sets (custom) in assigned cell types (by scType), I calculate AUC scores by determining the level of activation of specific gene sets in each cell.
library(Seurat)
library(AUCell)
library(GSEABase)
# import data from single-cell experiment
# Seurat v5: use LayerData() instead of @assays$RNA@data
exprMatrix <- LayerData(obj.srt, assay = "RNA", layer = "data")
# Convert to sparse:
exprMatrix <- as(exprMatrix, "dgCMatrix")
# Create genesets to test
# getting them from msigDB or make custom ones
gmt
cells_rankings <- AUCell_buildRankings(exprMatrix, plotStats=TRUE)
cells_AUC <- AUCell_calcAUC(gmt, cells_rankings)
cells_assignment <- AUCell_exploreThresholds(cells_AUC, plotHist=TRUE, assign=TRUE)
# check the length inside
n <- length(cells_assignment)
aucThr <- matrix(nrow = n, ncol = 1) # 필요한 경우 열의 수를 조정하세요.
# This is an example rownames
rownames(aucThr) <- c("B", "Fibroblast", "Myeloid", "T_NK", "Tumor")[1:n]
for (i in 1:n) {
aucThr[i, ] <- cells_assignment[[i]]$aucThr$selected
}
# Data frame
aucThr %>% data.frame() %>% DT::datatable()
# Add the calculated AUC scores as metadata to the Seurat object
obj.srt <- AddMetaData(obj.srt, data.frame(t(cells_AUC@assays@data$AUC)))
# This is an additional part
Category = c("B" ,"Fibroblast","Myeloid","T_NK","Tumor")
for (cat in Category) {
rows <- cells_assignment[[cat]]$assignment
auc_column_name <- paste0("AUC_designated_", cat)
obj.srt@meta.data[, auc_column_name] <- "FALSE"
obj.srt@meta.data[rows, auc_column_name] <- "TRUE"
}
Note
Once obtaining AUC output, I check the information that can be transferred to seurat object metadata. There should be various visualization methods for the transferred data. I usually use UMAP visualization to check the general patterns of them.
DimPlot(obj.srt, group.by = "AUC_designated_my_category")
FeaturePlot(obj.srt, features = "AUC_score_for_my_geneset")
# Find markers based on the selected cluster
Idents(obj.srt) = 'RNA_snn_res.0.1'
fc=1.2
all.markers = FindAllMarkers(obj.srt, logfc.threshold = log2(fc),
only.pos = T, test.use = 'wilcox',min.pct = 0.5)
all.markers %>% DT::datatable(extensions = "Buttons",
options = list(autoWidth = TRUE,
fixedHeader = TRUE,
dom="Bfrtip", buttons=c("csv","excel")))
gs = genes_selected # Selected genes from the marker list
obj.srt$rev.res.0.1 = factor(obj.srt$RNA_snn_res.0.1,
levels = rev(levels(obj.srt$RNA_snn_res.0.1)))
DotPlot(obj.srt, group.by = "rev.res.0.1",
features = gs, cols = c("grey","red")) +RotatedAxis()
gs = genes_selected # Selected genes from the marker list
DoHeatmap(obj.srt, features = gs, group.by = "RNA_snn_res.0.1")
# Generating DEGs bewteen condition 1 and condition 2
DEG.two_groups= function(obj.srt,cluster=0, cond1="ADU", cond2="CTL"){
obj.srt@meta.data=obj.srt@meta.data %>%
mutate(compare = ifelse(RNA_snn_res.0.1 == cluster & sample == cond1,
paste0(cluster,"_", cond1),
ifelse(RNA_snn_res.0.1 == cluster & sample == cond2,
paste0(cluster,"_", cond2), "others")))
g2=paste0(cluster,"_", cond1)
g1=paste0(cluster,"_", cond2)
Idents(obj.srt) = 'compare'
logfc=log2(1)
mks =FindMarkers(obj.srt, ident.1 = g2, ident.2 = g1,
logfc.threshold = logfc, assay = "count")
pval=0.05
fc=1.2
mks = mks %>%
mutate(DE=ifelse(avg_log2FC >= log2(fc) & p_val_adj < pval, 'UP',
ifelse(avg_log2FC <= -log2(fc) & p_val_adj < pval, 'DN',
'no_sig')))
mks$DE = factor(mks$DE, levels = c('UP','DN','no_sig'))
mks$gene = rownames(mks)
mks =mks %>% mutate(labels= ifelse(DE == 'UP', gene, ifelse(DE=='DN',gene,'')))
mks =mks %>% arrange(desc(avg_log2FC))
return(mks)
}
# Application
deg.cluster0 = DEG.two_groups(obj.srt = obj.srt, cluster = 0)
deg.cluster1 = DEG.two_groups(obj.srt = obj.srt, cluster = 1)
deg.cluster2 = DEG.two_groups(obj.srt = obj.srt, cluster = 2)
# Volcano plot
input.data = deg.cluster0
# Draw plot
input.data %>%
ggplot(aes(avg_log2FC, -log10(p_val_adj), color=DE)) +
geom_point(size=1, alpha=0.5) +
scale_color_manual(values = c('red','blue','grey')) +
theme_classic() +
geom_vline(xintercept = c(-log2(fc),log2(fc)), color='grey') +
geom_hline(yintercept = -log10(0.05),color='grey') +
guides(colour = guide_legend(override.aes = list(size=5))) +
ggtitle(paste0("Plot Title here")) +
geom_text(aes(label = labels), size = 2.5,
show.legend = FALSE, hjust = 0, nudge_x = 0.01) +
ggeasy::easy_center_title()
This analysis uses hallmark pathways for the analaysis.
library(clusterProfiler)
hallmark <- msigdbr::msigdbr(species = "Mus musculus", category = "H") %>%
dplyr::select(gs_name, gene_symbol)
perform_GSEA <- function(res, ref, pvalueCutoff = 1) {
ranking <- function(res) {
df <- res$avg_log2FC
names(df) <- rownames(res)
df <- sort(df, decreasing = TRUE)
return(df)
}
ranked.res <- ranking(res)
x <- clusterProfiler::GSEA(geneList = ranked.res,
TERM2GENE = ref,
pvalueCutoff = pvalueCutoff,
pAdjustMethod = "BH",
verbose = TRUE,
seed = TRUE)
result <- x@result %>% arrange(desc(NES))
result <- result[, c('NES', 'pvalue', 'p.adjust', 'core_enrichment', 'ID')]
return(result)
}
# Application
gsea.res = perform_GSEA(res = deg.cluster3, ref = hallmark)
# GESA Plot
gsea_nes_plot =function(gsea.res, title){
gsea.res %>% ggplot(aes(reorder(ID, NES), NES)) +
geom_col(aes(fill=p.adjust)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title= "GSEA") +
theme_classic() +
scale_fill_gradient(low = 'red', high = '#E5E7E9') +
theme(axis.text.x= element_text(size=5, face = 'bold'),
axis.text.y= element_text(size=6, face = 'bold'),
axis.title =element_text(size=10)) +ggtitle(title)
}
# Application
gsea_nes_plot(gsea.res[gsea.res$p.adjust <= 0.05,], title = "Cluster3")
id1 = "GOBP_T_CELL_ACTIVATION" # Selected pathway"
enrichplot::gseaplot2(gsea.res, geneSetID = id1, title = id1)
Trajectory analysis infers developmental or activation paths across
cells. Monocle3 works natively with Seurat objects via
SeuratWrappers.
Reference: https://cole-trapnell-lab.github.io/monocle3/
library(monocle3)
library(SeuratWrappers)
# Convert Seurat object to CellDataSet
cds <- as.cell_data_set(obj.srt)
# Pre-process (skip if already done in Seurat)
cds <- preprocess_cds(cds, num_dim = 50)
# Transfer UMAP coordinates from Seurat
reducedDim(cds, type = "UMAP") <- obj.srt@reductions$umap@cell.embeddings[, 1:2]
cds@int_colData@listData$reducedDims$UMAP <- obj.srt@reductions$umap@cell.embeddings[, 1:2]
# Cluster and learn trajectory graph
cds <- cluster_cells(cds)
cds <- learn_graph(cds)
# Order cells along trajectory
# Use order_cells(cds) interactively, or specify root cells:
# cds <- order_cells(cds, root_cells = root_cell_ids)
cds <- order_cells(cds)
# Visualize pseudotime
plot_cells(cds,
color_cells_by = "pseudotime",
label_cell_groups = FALSE,
label_leaves = FALSE,
label_branch_points = FALSE,
graph_label_size = 1.5)
MAGIC (Cell, 2018)
https://github.com/KrishnaswamyLab/MAGIC
Markov Affinity-based Graph Imputation of Cells (MAGIC) is an algorithm
for denoising high-dimensional data most commonly applied to single-cell
RNA sequencing data. MAGIC learns the manifold data, using the resultant
graph to smooth the features and restore the structure of the
data.
# Import the necessary libraries.
import pandas as pd
import magic
import csv
# Load the CSV file into a DataFrame.
data_path = '~/path/data.csv'
x = pd.read_csv(data_path)
# Remove the 'Unnamed: 0'
del x['Unnamed: 0']
# Initialize the MAGIC algorithm. the number of nearest neighbors (knn) is set to 5, and the diffusion time (t) is set to 3.
magic_operator = magic.MAGIC(knn=5, t=3)
# Apply the MAGIC algorithm to the data.
x_magic = magic_operator.fit_transform(x)
# Save the data to a CSV file.
output_path = '~/path/data.magic.csv'
x_magic.to_csv(output_path)