cellranger :
# cellranger count for transcriptome data processing
cellranger count --id=SampleA_GEX \
--transcriptome=/path/to/reference/transcriptomes/Mouse_GEX_2020/refdata-gex-mm10-2020-A \
--fastqs=/path/to/raw_data/SampleA \
--sample=SampleA \
--expect-cells=10000
# cellranger aggr to aggregate multiple cellranger count output to one file
cellranger aggr --id=Aggregate_GEX \
--csv=/path/to/aggregate_info/Aggregate_GEX.csv
library_id,molecule_h5
SampleA,/path/to/SampleA/outs/molecule_info.h5
SampleB,/path/to/SampleB/outs/molecule_info.h5
SampleC,/path/to/SampleC/outs/molecule_info.h5
Aggregated Feature-Barcode Matrices
library(Seurat)
obj.raw = Read10X('raw_data/Path/filtered_feature_bc_matrix/')
obj.srt = CreateSeuratObject(counts = obj.raw, project = 'project_name')
scrublet runs in python env.
Input file : count matrix from seurat object. Run scrublet in conda
environment.
# Create and activate a new conda environment
conda create -n bioinfo_env python=3.8 -y
conda activate bioinfo_env
# Install pip if not already installed
conda install pip -y
# Install necessary packages with pip
pip install numpy pandas scanpy scrublet
# run python
python
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
def run_scrublet(input_file, output_file, expected_doublet_rate=0.1):
df = pd.read_csv(input_file, header=0, index_col=0)
adata = sc.AnnData(df)
# Set the expected_doublet_rate parameter
sc.external.pp.scrublet(adata, expected_doublet_rate=expected_doublet_rate)
# Save the observation (results) to CSV
adata.obs.to_csv(output_file)
input_file = 'input.count.csv'
output_file = 'output.scr.csv'
# Adjust the expected_doublet_rate parameter
expected_doublet_rate = 0.1
run_scrublet(input_file, output_file, expected_doublet_rate)
Add the scrublet output information to the seurat object.
## import scrublet result
df= read.csv('path/output.scr.csv', row.names = 1)
# If the previous process changes the name of cell ID,
# Use the following code to correct it.
rownames(df) = gsub(pattern = '_', replacement = '-' ,rownames(df))
## add doublet info to the srt obj
obj.srt[['doublet_score']] = df[rownames(obj.srt@meta.data),]$doublet_score
obj.srt[['predicted_doublet']] =df[rownames(obj.srt@meta.data),]$predicted_doublet
obj.srt %>% saveRDS("path/saved.obj.rds")
# You might want to check the number of doublets in the data
obj.srt@meta.data %>% select(predicted_doublet) %>% table()
Filtering by UMI & mitocondrial content ratio
3 versions of Filtering
dir <- "path/to/cellrangeroutput/"
wt_numbers <- 1:3
# lapply > Seurat object
obj_list <- lapply(wt_numbers, function(i) {
data_dir <- paste0(dir, "WT_", i, "/")
obj.raw <- Read10X(data.dir = data_dir)
obj.srt <- CreateSeuratObject(counts = obj.raw, project = paste0("WT", i))
return(obj.srt)
})
# Name the list
names(obj_list) <- paste0("wt", wt_numbers)
# list
obj_list
# save the list of rds files
dir <- "~/path/to/project/"
obj_list %>% saveRDS(paste0(dir, "rds/obj_all_list.rds"))
# Add mitochondrial percentage to meta data
# mm10 mitochondrial genes : ^mt-
# ref (https://www.michaelchimenti.com/2019/03/calculate-mitochondrial-for-mouse-scrna-seq/)
obj_list <- lapply(obj_list, function(obj) {
obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^mt-")
return(obj)
})
umi = 1000
mitoratio = 15
plots <- lapply(obj_list, function(obj) {
count_filtered <- sum(obj@meta.data$nCount_RNA >= umi & obj@meta.data$percent.mt <= mitoratio)
ggplot(obj@meta.data, aes(nCount_RNA, percent.mt)) +
geom_point(size=0.2) +
theme_bw() +
scale_x_log10() +
geom_vline(xintercept = umi, color="salmon") +
geom_hline(yintercept = mitoratio, color="salmon") +
ggtitle(obj@meta.data$orig.ident[1]) +
labs(caption = paste0("UMI cutoff :", umi, " and mito % : ", mitoratio)) +
geom_text(aes(x = Inf, y = Inf, label = paste("Count:", count_filtered)),
hjust = 1.1, vjust = 1.5, size = 3, color = "blue")
})
plot_grid(plotlist = plots, ncol = 3)
# subset
subset_list <- lapply(obj_list, function(obj) {
subset_obj <- subset(obj, subset = nCount_RNA >= 1000 & percent.mt <= 15)
return(subset_obj)
})
# merge
obj.srt = merge(subset_list[[1]],
y = c(subset_list[[2]], subset_list[[3]]),
add.cell.ids = names(subset_list))
obj.srt <- JoinLayers(obj.srt)
# plot
obj.srt@meta.data %>% ggplot(aes(nCount_RNA, percent.mt)) +
geom_point(size=0.2) +
theme_bw() +
facet_wrap(.~orig.ident, ncol = 3)