2024 ADCs testing data

bulk RNA-seq

library(dplyr)
library(ggplot2)
library(DT)

Sample data matrix

Elastic Net : Important features selection

  • Data Normalization: z-scoring
  • Label: sample name
  • alpha: 0.05 (L1:0.05, L2:0.95)
library(glmnet)

# Data normalization first
z_score_tpm <- t(apply(tpm, 1, function(x) (x - mean(x)) / sd(x)))
# z_score_tpm <- na.omit(z_score_tpm)  # NA handling

# Set input values 
x <- t(as.matrix(z_score_tpm))
# y <- colnames(tpm)
y <- c(rep("CTRL", 3),
       rep("CND1", 3),
       rep("CND2", 3),
       rep("CND3", 3))

# y를 수치형으로 변환
y <- as.numeric(as.factor(y)) - 1  # 팩터를 수치형으로 변환하고, R의 팩터 인덱싱(1 시작)을 조정


# alpha 값을 다양하게 설정하여 모델 적합
alpha_values <- c(0, 0.05, 0.1, 0.2, 0.25, 0.5, 0.75, 1)  # 다양한 alpha 값
models <- list()
for (alpha in alpha_values) {
  set.seed(123)
  cv_model <- cv.glmnet(x, y, alpha=alpha, family="gaussian", grouped = FALSE)
  # best_lambda <- cv_model$lambda.min
  best_lambda <- cv_model$lambda.1se
  model <- glmnet(x, y, alpha=alpha, lambda=best_lambda)
  models[[as.character(alpha)]] <- model
}

# each alpha generates the following number of coef
sapply(models, function(model) length(which(coef(model)[-1] != 0)))
##     0  0.05   0.1   0.2  0.25   0.5  0.75     1 
## 22845   112    88     1     0     4     6     9
# best lambda by cross validation
# n is the selected alpha value (now fixed) : 0.05

n= 0.05
set.seed(123)
cv_model <- cv.glmnet(x, y, alpha=n, family="gaussian")

# best_lambda <- cv_model$lambda.min
best_lambda <- cv_model$lambda.min
final_model <- glmnet(x, y, alpha=n, lambda=best_lambda)

# final_model

# coef(final_model)

# Select the important genes 
selected_genes <- which(coef(final_model)[-1] != 0)  # first == intercept
rownames(tpm)[selected_genes]
##   [1] "LASP1"      "CYP26B1"    "CD79B"      "UBE3C"      "FOXP3"     
##   [6] "LAMC3"      "TRAM1"      "DGCR2"      "RDH11"      "ZFYVE26"   
##  [11] "MRPS34"     "PLD1"       "DLG1"       "MBNL3"      "EDN1"      
##  [16] "CRYBG3"     "SLC4A4"     "LAMB1"      "PITPNM3"    "CD200"     
##  [21] "SUSD2"      "HIRA"       "CSF2RB"     "CTSG"       "PROCR"     
##  [26] "SIRPB1"     "CEMIP"      "CSPP1"      "ASAH1"      "SLC17A7"   
##  [31] "TAX1BP1"    "COBL"       "AGR2"       "RASSF4"     "EZH1"      
##  [36] "AREG"       "CPE"        "RPL34"      "CBL"        "POU2AF1"   
##  [41] "SELPLG"     "TSPAN11"    "WNT5A"      "ATP6V1A"    "ITGB6"     
##  [46] "EFHD1"      "HPCAL1"     "QPCT"       "PNO1"       "RALGPS2"   
##  [51] "STXBP3"     "CD46"       "CDC20"      "UTP25"      "VPS4B"     
##  [56] "EPCAM"      "GLIPR2"     "ACO1"       "ARHGAP9"    "FAM124B"   
##  [61] "SRSF6"      "USP22"      "RPS10"      "CXCL6"      "MIR9-1HG"  
##  [66] "CD93"       "BMP2"       "TRAP1"      "FGFRL1"     "VGF"       
##  [71] "MIS18BP1"   "RNASE1"     "LSP1"       "ARHGEF16"   "PRRG1"     
##  [76] "ADGRE3"     "GFPT2"      "VSTM2L"     "KIDINS220"  "PRPF38A"   
##  [81] "C5AR2"      "APC"        "GOLM1"      "MICAL1"     "GNS"       
##  [86] "STAM"       "PIM1"       "MRPL15"     "CH25H"      "RDH16"     
##  [91] "ZNF740"     "FGF7"       "GCNT3"      "DHX38"      "GAREM1"    
##  [96] "SIK1"       "EPHA2"      "RGS5"       "MINDY1"     "VASH2"     
## [101] "COL8A1"     "OSMR"       "TMEM47"     "ARHGAP39"   "SURF4"     
## [106] "SIDT2"      "FAM124A"    "TDO2"       "TGOLN2"     "SPARCL1"   
## [111] "BANK1"      "PDIA4"      "WHAMM"      "CD109"      "EXOG"      
## [116] "WASF2"      "EIF5B"      "FCER1G"     "STC1"       "TNFRSF13C" 
## [121] "VAV2"       "TONSL"      "PLPP3"      "C1orf74"    "NUP35"     
## [126] "SGPP2"      "INHBB"      "ALPP"       "TGFBR2"     "NUAK2"     
## [131] "RPL22L1"    "SCGB3A2"    "GALNT10"    "ZNF704"     "CTHRC1"    
## [136] "DCHS1"      "GARRE1"     "PPIB"       "EVA1C"      "FN3K"      
## [141] "NFKBID"     "KRT80"      "CDC40"      "LRATD2"     "DHRSX"     
## [146] "FASN"       "CSGALNACT2" "ARL6IP1"    "HAS2"       "HS6ST2"    
## [151] "SYNPO2"     "CKS1B"      "OLR1"       "ZDHHC14"    "YPEL2"     
## [156] "NRIP3"      "CLCF1"      "GNG7"       "WSB2"       "APOLD1"    
## [161] "C5orf24"    "GPR39"      "APOBR"      "RPS27L"     "TCEAL9"    
## [166] "GNB1L"      "EVI2B"      "RPS23"      "SELL"       "NBR1"      
## [171] "PAX5"       "FAT4"       "CTSE"       "PIK3R4"     "AJAP1"     
## [176] "HNRNPAB"    "MYO5A"      "S100A10"    "CAPN8"      "SAMD5"     
## [181] "NHSL2"      "HLA-DRA"    "NEU1"       "LGR4"       "TMSB4X"    
## [186] "IGLV3-21"   "IGHG2"      "IGHG3"      "IGHV1-2"    "IGHV3-11"  
## [191] "IGHV3-21"   "IGHV4-39"   "NEURL1B"    "LINC01615"  "ZNF844"    
## [196] "C14orf132"  "LTB"        "HSBP1"      "TRIM26"     "NFAM1"     
## [201] "ZNF853"     "PSMB9"      "PLEKHO2"    "MICAL3"     "ECSCR"     
## [206] "IGLV2-5"    "ITGB3"      "TMEM263-DT" "SPON1"      "NCOA4"     
## [211] "CCL5"       "IGHV7-4-1"  "MSANTD7"
# 선택된 유전자들만 포함한 새로운 데이터셋 생성
reduced_data <- x[, selected_genes]
# dim(reduced_data)  # 축소된 데이터의 차원 확인
final_model %>% print()
## 
## Call:  glmnet(x = x, y = y, alpha = n, lambda = best_lambda) 
## 
##    Df  %Dev Lambda
## 1 213 69.93  10.43

Heatmap of zscaled tpm by the selected features

# Heatmap 
my.color=c(colorRampPalette(colors = c("navy","white"))(20),
           colorRampPalette(colors = c("white","salmon"))(20))
t(reduced_data) %>% 
  pheatmap::pheatmap(cluster_cols = F, 
                     gaps_col = c(3,6,9), 
                     fontsize_row = 5, 
                     color = my.color)

Other alpha value test

Define Function

elasticNetModel <- function(x, y, alpha_value = 0.05) {
  set.seed(123)
  
  # 교차 검증을 사용하여 최적의 lambda 값을 찾습니다.
  cv_model <- cv.glmnet(x, y, alpha = alpha_value, family = "gaussian")
  best_lambda <- cv_model$lambda.min
  
  # 최적의 lambda를 사용하여 모델을 학습합니다.
  final_model <- glmnet(x, y, alpha = alpha_value, lambda = best_lambda)

  # 계수 확인 및 중요 유전자 선택
  model_coefs <- coef(final_model)
  selected_genes_indices <- which(model_coefs[-1] != 0)  # 첫 번째 요소(절편) 제외
  important_genes <- colnames(x)[selected_genes_indices]
  
  # 선택된 유전자들만 포함한 새로운 데이터셋 생성
  reduced_data <- x[, selected_genes_indices]
  
  # 결과를 리스트로 반환
  return(list(
    final_model = final_model,
    important_genes = important_genes,
    reduced_data = reduced_data
  ))
}

Set Input matrix and Lable

# Set input values 
x <- t(as.matrix(z_score_tpm))
# y <- colnames(tpm)
y <- c(rep("CTRL", 3),
       rep("CND1", 3),
       rep("CND2", 3),
       rep("CND3", 3))

# y를 수치형으로 변환
y <- as.numeric(as.factor(y)) - 1  # 팩터를 수치형으로 변환하고, R의 팩터 인덱싱(1 시작)을 조정

Elastic Net (alpha:0.1)

results <- elasticNetModel(x = x, y = y, alpha_value = 0.1)

results$final_model %>% print()
## 
## Call:  glmnet(x = x, y = y, alpha = alpha_value, lambda = best_lambda) 
## 
##    Df  %Dev Lambda
## 1 119 63.31  5.996
my.color=c(colorRampPalette(colors = c("navy","white"))(20),
           colorRampPalette(colors = c("white","salmon"))(20))
t(results$reduced_data) %>% 
  pheatmap::pheatmap(cluster_cols = F, 
                     gaps_col = c(3,6,9), 
                     fontsize_row = 7, 
                     color = my.color)

# results$important_genes %>% length()
# # "EPCAM" %in% results$important_genes

Elastic Net (alpha:0.5)

results <- elasticNetModel(x = x, y = y, alpha_value = 0.5)

results$final_model %>% print()
## 
## Call:  glmnet(x = x, y = y, alpha = alpha_value, lambda = best_lambda) 
## 
##   Df  %Dev Lambda
## 1 18 56.76  1.379
my.color=c(colorRampPalette(colors = c("navy","white"))(20),
           colorRampPalette(colors = c("white","salmon"))(23))
t(results$reduced_data) %>% 
  pheatmap::pheatmap(cluster_cols = F, 
                     gaps_col = c(3,6,9), 
                     fontsize_row = 7, 
                     color = my.color)

Note No prediction in this workflow.