RNA-seq data preparation
Import
TCGA_BRCA_countsMeta = readRDS(paste0(dir,"rds/TCGA_BRCA_countsMeta.rds"))
treament = TCGA_BRCA_countsMeta$meta$treatments
Create DESeq object
count.raw = TCGA_BRCA_countsMeta$counts
# Remove the duplicated gene names
rs = rownames(count.raw)[!is.na(rownames(count.raw))]
count.mtx = count.raw[rs,]
# Remove genes with no expression across samples
count.mtx = count.mtx[rowSums(count.mtx) !=0,]
meta = TCGA_BRCA_countsMeta$meta
meta = meta[colnames(count.mtx),]
library(SummarizedExperiment)
library(DESeq2)
se <- SummarizedExperiment(as.matrix(count.mtx),
colData=meta)
dds <- DESeqDataSet(se, ~ 1)
vsd <- vst(dds, blind=FALSE)
pcaData <- DESeq2::plotPCA(vsd, intgroup = "sample_type", returnData = TRUE)
pcaData$group = dds$group
PCA_var=attr(pcaData, "percentVar")
PCA plot
ggplot(pcaData, aes(x = PC1, y = PC2, fill = sample_type)) +
geom_point(size = 2, alpha = 0.8, shape = 21, color = "black", stroke = 0.2) +
# ggrepel::geom_text_repel(aes(label=name),
# color="grey6", size=3, hjust= -0.3, vjust=-0.3) +
labs(x = paste("PC1: ", round(100 * PCA_var[1]), "% variance"),
y = paste("PC2: ", round(100 * PCA_var[2]), "% variance")) +
theme_bw() +
theme(legend.title = element_blank()) +
ggtitle("PCA") +
labs(caption = " ")

PCA plot with sample info
# sample_type의 갯수 계산
sample_type_counts <- table(pcaData$sample_type)
# 플롯 생성
p <- ggplot(pcaData, aes(x = PC1, y = PC2, fill = sample_type)) +
geom_point(size = 2, alpha = 0.8, shape = 21, color = "black", stroke = 0.2) +
# ggrepel::geom_text_repel(aes(label = name),
# color = "grey6", size = 3, hjust = -0.3, vjust = -0.3) +
labs(x = paste("PC1: ", round(100 * PCA_var[1]), "% variance"),
y = paste("PC2: ", round(100 * PCA_var[2]), "% variance")) +
theme_bw() +
theme(legend.title = element_blank()) +
ggtitle("PCA") +
labs(caption = " ")
# sample_type 갯수 추가
count_text <- paste(names(sample_type_counts), sample_type_counts, sep = ": ", collapse = "\n")
# 플롯에 텍스트 추가
p + annotate("text", x = Inf, y = Inf, label = count_text, hjust = 1, vjust = 1, size = 3, color = "black", angle = 0, fontface = "bold")

TSNE Plot of samples
Processing
- Based on the count data (raw read)
library(Rtsne)
# Remove the duplicated rows(genes)
unique_data <- count.mtx[!duplicated(count.mtx), ]
# run t-SNE
tsne_result <- Rtsne(t(unique_data), dims = 2, perplexity = 3, verbose = TRUE, max_iter = 100)
## Performing PCA
## Read the 1224 x 50 data matrix successfully!
## Using no_dims = 2, perplexity = 3.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.03 seconds (sparsity = 0.011850)!
## Learning embedding...
## Iteration 50: error is 96.928521 (50 iterations in 0.09 seconds)
## Iteration 100: error is 87.008262 (50 iterations in 0.05 seconds)
## Fitting performed in 0.14 seconds.
TSNE plot
tsne_data <- data.frame(X = tsne_result$Y[,1], Y = tsne_result$Y[,2])
ggplot(tsne_data, aes(x = X, y = Y)) +
geom_point(size = 1, alpha = 0.8, shape = 21, color = "black", stroke = 0.2) +
ggtitle("t-SNE Plot") +
xlab("t-SNE 1") +
ylab("t-SNE 2") +
theme_minimal()

TSNE plot by tumor type
tsne_data = tsne_result$Y
rownames(tsne_data) = colnames(unique_data)
tsne_data = cbind(tsne_data,as.data.frame(TCGA_BRCA_countsMeta$meta[rownames(tsne_data),]))
colnames(tsne_data)[1:2] = c(paste0("TSNE", 1:2))
ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, fill=definition)) +
geom_point(size = 2, alpha = 0.8, shape=21, color = "black", stroke = 0.2) +
ggtitle("t-SNE Plot") +
xlab("t-SNE 1") +
ylab("t-SNE 2") +
labs(fill = NULL) +
theme_minimal()

TSNE plot by pathologic_stage
ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, fill=ajcc_pathologic_stage)) +
geom_point(size = 2, alpha = 0.8, shape=21, color = "black", stroke = 0.2) +
ggtitle("t-SNE Plot") +
xlab("t-SNE 1") +
ylab("t-SNE 2") +
labs(fill = NULL) +
theme_minimal()

TSNE plot by BRCA PAM50 subtype
ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, fill=paper_BRCA_Subtype_PAM50)) +
geom_point(size = 2, alpha = 0.8, shape=21, color = "black", stroke = 0.2) +
ggtitle("t-SNE Plot") +
xlab("t-SNE 1") +
ylab("t-SNE 2") +
labs(fill = NULL) +
theme_minimal()

ggplot(tsne_data, aes(x = TSNE1, y = TSNE2, fill=paper_BRCA_Subtype_PAM50)) +
geom_point(size = 2, alpha = 0.8, shape=21, color = "black", stroke = 0.2) +
ggtitle("t-SNE Plot") +
xlab("t-SNE 1") +
ylab("t-SNE 2") +
labs(fill = NULL) +
theme_minimal() +
facet_wrap(.~paper_BRCA_Subtype_PAM50)
