2024 ADCs testing data
bulk RNA-seq
Sample data matrix
Elastic Net : Important features selection
- Data Normalization: z-scoring
- Label: sample name
- alpha: 0.05 (L1:0.05, L2:0.95)
library(glmnet)
# Data normalization first
z_score_tpm <- t(apply(tpm, 1, function(x) (x - mean(x)) / sd(x)))
# z_score_tpm <- na.omit(z_score_tpm) # NA handling
# Set input values
x <- t(as.matrix(z_score_tpm))
# y <- colnames(tpm)
y <- c(rep("CTRL", 3),
rep("CND1", 3),
rep("CND2", 3),
rep("CND3", 3))
# y를 수치형으로 변환
y <- as.numeric(as.factor(y)) - 1 # 팩터를 수치형으로 변환하고, R의 팩터 인덱싱(1 시작)을 조정
# alpha 값을 다양하게 설정하여 모델 적합
alpha_values <- c(0, 0.05, 0.1, 0.2, 0.25, 0.5, 0.75, 1) # 다양한 alpha 값
models <- list()
for (alpha in alpha_values) {
set.seed(123)
cv_model <- cv.glmnet(x, y, alpha=alpha, family="gaussian", grouped = FALSE)
# best_lambda <- cv_model$lambda.min
best_lambda <- cv_model$lambda.1se
model <- glmnet(x, y, alpha=alpha, lambda=best_lambda)
models[[as.character(alpha)]] <- model
}
# each alpha generates the following number of coef
sapply(models, function(model) length(which(coef(model)[-1] != 0)))
## 0 0.05 0.1 0.2 0.25 0.5 0.75 1
## 22845 112 88 1 0 4 6 9
# best lambda by cross validation
# n is the selected alpha value (now fixed) : 0.05
n= 0.05
set.seed(123)
cv_model <- cv.glmnet(x, y, alpha=n, family="gaussian")
# best_lambda <- cv_model$lambda.min
best_lambda <- cv_model$lambda.min
final_model <- glmnet(x, y, alpha=n, lambda=best_lambda)
# final_model
# coef(final_model)
# Select the important genes
selected_genes <- which(coef(final_model)[-1] != 0) # first == intercept
rownames(tpm)[selected_genes]
## [1] "LASP1" "CYP26B1" "CD79B" "UBE3C" "FOXP3"
## [6] "LAMC3" "TRAM1" "DGCR2" "RDH11" "ZFYVE26"
## [11] "MRPS34" "PLD1" "DLG1" "MBNL3" "EDN1"
## [16] "CRYBG3" "SLC4A4" "LAMB1" "PITPNM3" "CD200"
## [21] "SUSD2" "HIRA" "CSF2RB" "CTSG" "PROCR"
## [26] "SIRPB1" "CEMIP" "CSPP1" "ASAH1" "SLC17A7"
## [31] "TAX1BP1" "COBL" "AGR2" "RASSF4" "EZH1"
## [36] "AREG" "CPE" "RPL34" "CBL" "POU2AF1"
## [41] "SELPLG" "TSPAN11" "WNT5A" "ATP6V1A" "ITGB6"
## [46] "EFHD1" "HPCAL1" "QPCT" "PNO1" "RALGPS2"
## [51] "STXBP3" "CD46" "CDC20" "UTP25" "VPS4B"
## [56] "EPCAM" "GLIPR2" "ACO1" "ARHGAP9" "FAM124B"
## [61] "SRSF6" "USP22" "RPS10" "CXCL6" "MIR9-1HG"
## [66] "CD93" "BMP2" "TRAP1" "FGFRL1" "VGF"
## [71] "MIS18BP1" "RNASE1" "LSP1" "ARHGEF16" "PRRG1"
## [76] "ADGRE3" "GFPT2" "VSTM2L" "KIDINS220" "PRPF38A"
## [81] "C5AR2" "APC" "GOLM1" "MICAL1" "GNS"
## [86] "STAM" "PIM1" "MRPL15" "CH25H" "RDH16"
## [91] "ZNF740" "FGF7" "GCNT3" "DHX38" "GAREM1"
## [96] "SIK1" "EPHA2" "RGS5" "MINDY1" "VASH2"
## [101] "COL8A1" "OSMR" "TMEM47" "ARHGAP39" "SURF4"
## [106] "SIDT2" "FAM124A" "TDO2" "TGOLN2" "SPARCL1"
## [111] "BANK1" "PDIA4" "WHAMM" "CD109" "EXOG"
## [116] "WASF2" "EIF5B" "FCER1G" "STC1" "TNFRSF13C"
## [121] "VAV2" "TONSL" "PLPP3" "C1orf74" "NUP35"
## [126] "SGPP2" "INHBB" "ALPP" "TGFBR2" "NUAK2"
## [131] "RPL22L1" "SCGB3A2" "GALNT10" "ZNF704" "CTHRC1"
## [136] "DCHS1" "GARRE1" "PPIB" "EVA1C" "FN3K"
## [141] "NFKBID" "KRT80" "CDC40" "LRATD2" "DHRSX"
## [146] "FASN" "CSGALNACT2" "ARL6IP1" "HAS2" "HS6ST2"
## [151] "SYNPO2" "CKS1B" "OLR1" "ZDHHC14" "YPEL2"
## [156] "NRIP3" "CLCF1" "GNG7" "WSB2" "APOLD1"
## [161] "C5orf24" "GPR39" "APOBR" "RPS27L" "TCEAL9"
## [166] "GNB1L" "EVI2B" "RPS23" "SELL" "NBR1"
## [171] "PAX5" "FAT4" "CTSE" "PIK3R4" "AJAP1"
## [176] "HNRNPAB" "MYO5A" "S100A10" "CAPN8" "SAMD5"
## [181] "NHSL2" "HLA-DRA" "NEU1" "LGR4" "TMSB4X"
## [186] "IGLV3-21" "IGHG2" "IGHG3" "IGHV1-2" "IGHV3-11"
## [191] "IGHV3-21" "IGHV4-39" "NEURL1B" "LINC01615" "ZNF844"
## [196] "C14orf132" "LTB" "HSBP1" "TRIM26" "NFAM1"
## [201] "ZNF853" "PSMB9" "PLEKHO2" "MICAL3" "ECSCR"
## [206] "IGLV2-5" "ITGB3" "TMEM263-DT" "SPON1" "NCOA4"
## [211] "CCL5" "IGHV7-4-1" "MSANTD7"
# 선택된 유전자들만 포함한 새로운 데이터셋 생성
reduced_data <- x[, selected_genes]
# dim(reduced_data) # 축소된 데이터의 차원 확인
##
## Call: glmnet(x = x, y = y, alpha = n, lambda = best_lambda)
##
## Df %Dev Lambda
## 1 213 69.93 10.43
Other alpha value test
Define Function
elasticNetModel <- function(x, y, alpha_value = 0.05) {
set.seed(123)
# 교차 검증을 사용하여 최적의 lambda 값을 찾습니다.
cv_model <- cv.glmnet(x, y, alpha = alpha_value, family = "gaussian")
best_lambda <- cv_model$lambda.min
# 최적의 lambda를 사용하여 모델을 학습합니다.
final_model <- glmnet(x, y, alpha = alpha_value, lambda = best_lambda)
# 계수 확인 및 중요 유전자 선택
model_coefs <- coef(final_model)
selected_genes_indices <- which(model_coefs[-1] != 0) # 첫 번째 요소(절편) 제외
important_genes <- colnames(x)[selected_genes_indices]
# 선택된 유전자들만 포함한 새로운 데이터셋 생성
reduced_data <- x[, selected_genes_indices]
# 결과를 리스트로 반환
return(list(
final_model = final_model,
important_genes = important_genes,
reduced_data = reduced_data
))
}
Set Input matrix and Lable
Elastic Net (alpha:0.1)
##
## Call: glmnet(x = x, y = y, alpha = alpha_value, lambda = best_lambda)
##
## Df %Dev Lambda
## 1 119 63.31 5.996
Elastic Net (alpha:0.5)
##
## Call: glmnet(x = x, y = y, alpha = alpha_value, lambda = best_lambda)
##
## Df %Dev Lambda
## 1 18 56.76 1.379
my.color=c(colorRampPalette(colors = c("navy","white"))(20),
colorRampPalette(colors = c("white","salmon"))(23))
t(results$reduced_data) %>%
pheatmap::pheatmap(cluster_cols = F,
gaps_col = c(3,6,9),
fontsize_row = 7,
color = my.color)
Note No prediction in this workflow.