QC Processing

Raw data processing

cellranger :

10x Genomics’s Chromium system to process single cell RNA-seq raw data.
Easy to use but need large computing resources and Linux environment.
Input : Fastq files, cellranger genome index

cellranger count/aggr

# cellranger count for transcriptome data processing 
cellranger count --id=SampleA_GEX \
--transcriptome=/path/to/reference/transcriptomes/Mouse_GEX_2020/refdata-gex-mm10-2020-A \
--fastqs=/path/to/raw_data/SampleA \
--sample=SampleA \
--expect-cells=10000

# cellranger aggr to aggregate multiple cellranger count output to one file 
cellranger aggr --id=Aggregate_GEX \
--csv=/path/to/aggregate_info/Aggregate_GEX.csv

Aggregate_GEX.csv example

library_id,molecule_h5
SampleA,/path/to/SampleA/outs/molecule_info.h5
SampleB,/path/to/SampleB/outs/molecule_info.h5
SampleC,/path/to/SampleC/outs/molecule_info.h5

Output

Aggregated Feature-Barcode Matrices

matrix.mtx
genes.tsv
barcodes.tsv

Read Cellranger output in R

library(Seurat)
obj.raw = Read10X('raw_data/Path/filtered_feature_bc_matrix/')
obj.srt = CreateSeuratObject(counts = obj.raw, project = 'project_name')

Doublet removal (by scrublet)

scrublet runs in python env.
Input file : count matrix from seurat object. Run scrublet in conda environment.

conda environment

# Create and activate a new conda environment
conda create -n bioinfo_env python=3.8 -y
conda activate bioinfo_env

# Install pip if not already installed
conda install pip -y

# Install necessary packages with pip
pip install numpy pandas scanpy scrublet

# run python
python

Run python script for scrublet

import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr

def run_scrublet(input_file, output_file, expected_doublet_rate=0.1):
    df = pd.read_csv(input_file, header=0, index_col=0)
    adata = sc.AnnData(df)
    
    # Set the expected_doublet_rate parameter
    sc.external.pp.scrublet(adata, expected_doublet_rate=expected_doublet_rate)
    
    # Save the observation (results) to CSV
    adata.obs.to_csv(output_file)

input_file = 'input.count.csv'
output_file = 'output.scr.csv'

# Adjust the expected_doublet_rate parameter
expected_doublet_rate = 0.1
run_scrublet(input_file, output_file, expected_doublet_rate)

Add the doublet info

Add the scrublet output information to the seurat object.

## import scrublet result
df= read.csv('path/output.scr.csv', row.names = 1)

# If the previous process changes the name of cell ID, 
# Use the following code to correct it. 
rownames(df) = gsub(pattern = '_', replacement = '-' ,rownames(df))

## add doublet info to the srt obj

obj.srt[['doublet_score']] = df[rownames(obj.srt@meta.data),]$doublet_score
obj.srt[['predicted_doublet']] =df[rownames(obj.srt@meta.data),]$predicted_doublet

obj.srt %>% saveRDS("path/saved.obj.rds")

# You might want to check the number of doublets in the data
obj.srt@meta.data %>% select(predicted_doublet) %>% table()

Filtering

Filtering by UMI & mitocondrial content ratio
3 versions of Filtering
- UMI >=1000 & mito % <=15
- UMI >=2000 & mito % <=10
- UMI >=5000 & mito % <=5

Example

Read the raw data

dir <- "path/to/cellrangeroutput/"
wt_numbers <- 1:3

# lapply > Seurat object
obj_list <- lapply(wt_numbers, function(i) {
  data_dir <- paste0(dir, "WT_", i, "/")
  obj.raw <- Read10X(data.dir = data_dir)
  obj.srt <- CreateSeuratObject(counts = obj.raw, project = paste0("WT", i))
  return(obj.srt)
})

# Name the list
names(obj_list) <- paste0("wt", wt_numbers)

# list
obj_list

# save the list of rds files 
dir <- "~/path/to/project/"
obj_list %>% saveRDS(paste0(dir, "rds/obj_all_list.rds"))

Add mitochondrial content info to the data

# Add mitochondrial percentage to meta data
# mm10 mitochondrial genes : ^mt-
# ref (https://www.michaelchimenti.com/2019/03/calculate-mitochondrial-for-mouse-scrna-seq/)
obj_list <- lapply(obj_list, function(obj) {
  obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^mt-")
  return(obj)
})

Plot the filtered samples

umi = 1000
mitoratio = 15

plots <- lapply(obj_list, function(obj) {
  count_filtered <- sum(obj@meta.data$nCount_RNA >= umi & obj@meta.data$percent.mt <= mitoratio)
  ggplot(obj@meta.data, aes(nCount_RNA, percent.mt)) + 
    geom_point(size=0.2) +
    theme_bw() +
    scale_x_log10() + 
    geom_vline(xintercept = umi, color="salmon") +
    geom_hline(yintercept = mitoratio, color="salmon") +
  ggtitle(obj@meta.data$orig.ident[1]) +
    labs(caption = paste0("UMI cutoff :", umi, " and mito % : ", mitoratio)) +
    geom_text(aes(x = Inf, y = Inf, label = paste("Count:", count_filtered)), 
              hjust = 1.1, vjust = 1.5, size = 3, color = "blue")
})

plot_grid(plotlist = plots, ncol = 3)

Subset samples

# subset
subset_list <- lapply(obj_list, function(obj) {
  subset_obj <- subset(obj, subset = nCount_RNA >= 1000 & percent.mt <= 15)
  return(subset_obj)
})

# merge 
obj.srt = merge(subset_list[[1]], 
                y = c(subset_list[[2]], subset_list[[3]]), 
                add.cell.ids = names(subset_list))

obj.srt <- JoinLayers(obj.srt)

# plot
obj.srt@meta.data %>% ggplot(aes(nCount_RNA, percent.mt)) +
  geom_point(size=0.2) +
  theme_bw() +
  facet_wrap(.~orig.ident, ncol = 3)