往期推薦
####----11_Lasso_Regression----####
rm(list = ls())
gc()
load('TCGA_STAD_data.RData')
library(dplyr)
library(tidyverse)
library(ggplot2)
library(survival)
library(glmnet) # 正則化技術(shù)
library(car) # 多重共線性檢驗(yàn)
library(corrplot) # 相關(guān)性繪圖
library(ggpubr) #基于ggplot2的可視化包ggpubr
library(patchwork) # 拼圖
library(ROCR) # 畫ROC曲線
library(caret) # 切割數(shù)據(jù)
Data <- data[,c(6,12:30917)]
Data$Status = ifelse(Data$Status =='Dead','1','0')
Data <- as.data.frame(lapply(Data,as.numeric))
### 數(shù)據(jù)檢查
min(Data$month) # 查看最小值,看是否有0
Data2 <- filter(Data, Data$month > 0) # 刪除數(shù)據(jù)框中time值小于0的樣本,有0的話后續(xù)分析會(huì)報(bào)錯(cuò)
dataL <- Data2 # 數(shù)據(jù)備份
hist(dataL$FBLN5, main = 'Distribution of gene expression levels',
xlab = 'FBLN5', ylab = 'gene expression', col = 'skyblue') # 直方圖
curve(dnorm(x, mean = mean(dataL$FBLN5), sd = sd(dataL$FBLN5)), add = TRUE, col = 'red') # 正態(tài)分布的概率密度函數(shù)
# shapiro-wilk檢驗(yàn)
shapiro.test(dataL$FBLN5) # 適用于小樣本數(shù)據(jù),N≤50,若p值<0.05,則數(shù)據(jù)不符合正態(tài)分布
# Kolmogorov-Smirnov檢驗(yàn)
ks.test(dataL$FBLN5, pnorm, mean = mean(dataL$FBLN5), sd = sd(dataL$FBLN5)) # 適用于大樣本數(shù)據(jù),N>50,若p值<0.05,則數(shù)據(jù)不符合正態(tài)分布
# 對(duì)所有列的數(shù)據(jù)進(jìn)行Kolmogorov-Smirnov循環(huán)檢驗(yàn)
ks_result <- data.frame('Gene' = character(), 'p.value' = numeric())
for (i in 3:length(colnames(dataL))){
p.value <- ks.test(dataL[,i], pnorm, mean = mean(dataL[,i]), sd = sd(dataL[,i]))$p.value
Gene <- colnames(dataL)[i]
tmp <- data.frame(Gene = Gene,
p.value = p.value)
ks_result <- rbind(ks_result, tmp)
}
max(ks_result$p.value) # 若p值<0.05,則數(shù)據(jù)不符合正態(tài)分布
cor_matrix <- cor(dataL[,3:12], use = 'everything', method = c('spearman'))
corrplot(cor_matrix, type = 'upper', method = 'color', tl.cex = 0.8, tl.col = 'black', order = 'AOE')
dataL2 <- dataL[,1:12]
model <- coxph(Surv(month, Status) ~ ., data = dataL2)
vif(model)
plot(vif(model),
xlim = c(0,10), ylim = c(0,10),
xlab = 'variables', ylab = 'VIF',
cex = 3, pch = 1)
# 當(dāng)VIF大于5時(shí),認(rèn)為存在多重共線性,當(dāng)VIF大于10時(shí),認(rèn)為存在嚴(yán)重的多重共線性
time <- dataL[,'month']status <- dataL[,'Status']x <- as.matrix(dataL[,-c(1,2)]) #x為輸入特征,是矩陣格式y(tǒng) <- as.matrix(dataL$Status)lasso <- glmnet(x = x, y = y, family = 'binomial', alpha = 1, # alpha = 1為L(zhǎng)ASSO回歸,= 0為嶺回歸,0和1之間則為彈性網(wǎng)絡(luò) nlambda = 100) # nlambda表示正則化路徑中的個(gè)數(shù),這個(gè)參數(shù)就可以起到一個(gè)閾值的作用,決定有多少基因的系數(shù)可以留下來(lái)。默認(rèn)值為100。print(lasso)plot(lasso, xvar = 'lambda', label = TRUE) # 系數(shù)分布圖,由“l(fā)og-lambda”與變量系數(shù)作圖,展示根據(jù)lambda的變化情況每一個(gè)特征的系數(shù)變化,展示了Lasso回歸篩選變量的動(dòng)態(tài)過(guò)程plot(lasso, xvar = 'dev', label = TRUE) # 也可以對(duì)%dev繪圖plot(lasso, xvar = 'norm', label = TRUE) # “L1范數(shù)”與變量系數(shù)作圖
set.seed(1234) # 設(shè)置種子數(shù),保證每次得到的交叉驗(yàn)證圖是一致的
# 計(jì)算100個(gè)λ,畫圖,篩選表現(xiàn)最好的λ值
lasso_cv <- cv.glmnet(x = x, y = y, family = 'binomial', alpha = 1, nlambda = 100) # 交叉驗(yàn)證,如果結(jié)果不理想,可以重新單獨(dú)運(yùn)行這一行代碼,或者換一下種子數(shù)
plot(lasso_cv) # 縱坐標(biāo):部分似然偏差。上橫坐標(biāo):當(dāng)前的變量數(shù)。下橫坐標(biāo):log(λ)
coef_lasso <- coef(lasso, s = 0.1) # 查看每個(gè)變量的回歸系數(shù)。s為lasso回歸的結(jié)果中的lambda值,選取相應(yīng)的λ值可以得到相應(yīng)數(shù)量的變量的回歸系數(shù)
coef_lasso
set.seed(1234) # 設(shè)置種子數(shù),保證每次得到的交叉驗(yàn)證圖是一致的
# 計(jì)算100個(gè)λ,畫圖,篩選表現(xiàn)最好的λ值
lasso_cv <- cv.glmnet(x = x, y = y, family = 'binomial', alpha = 1, nlambda = 100) # 交叉驗(yàn)證,如果結(jié)果不理想,可以重新單獨(dú)運(yùn)行這一行代碼,或者換一下種子數(shù)
plot(lasso_cv) # 縱坐標(biāo):部分似然偏差。上橫坐標(biāo):當(dāng)前的變量數(shù)。下橫坐標(biāo):log(λ)
lambda <- lasso_cv$lambda.min
# lambda <- lasso_cv$lambda.1se
lambda
coef_lasso_cv <- coef(lasso, s = lambda) # 查看每個(gè)變量的回歸系數(shù)。s為lasso回歸結(jié)果中的lambda值,可以得到相應(yīng)數(shù)量的變量的回歸系數(shù)
coef_lasso_cv[,1][coef_lasso_cv[,1]!=0]
exp(coef_lasso_cv[,1][coef_lasso_cv[,1]!=0]) # 自然對(duì)數(shù)的指數(shù)函數(shù),它的定義為exp(x) = e^x,其中e是自然對(duì)數(shù)的底數(shù)(約等于2.718),回歸系數(shù)<0的exp>1,反之<1。
model_lasso_min <- glmnet(x = x, y = y, alpha = 1, lambda = lasso_cv$lambda.min)
# model_lasso_1se <- glmnet(x = x, y = y, alpha = 1, lambda = lasso_cv$lambda.1se)
# 這兩個(gè)值體現(xiàn)在參數(shù)lambda上。有了模型,可以將篩選的基因挑出來(lái)了。
# 所有基因存放于model_lasso_min模型的子集beta中,用到的基因有一個(gè)s0值,沒(méi)用的基因只記錄了“.”,所以可以用下面代碼挑出用到的基因。
model_lasso_min$beta[,1][model_lasso_min$beta[,1]!=0]
choose_gene_min = rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta) != 0]
# choose_gene_1se = rownames(model_lasso_1se$beta)[as.numeric(model_lasso_1se$beta) != 0]
length(choose_gene_min)
# length(choose_gene_1se)
DCBLD2 NECAB2 ECH1 SERPINE1
0.0001176182 0.0050609354 0.0013001252 0.0076438492
PAPPA2 EGF IGFBP1 GPC3
0.0058767537 0.0017086554 0.0018483925 0.0031675436
SYN2 PRTG H3C14 U3
0.0046942426 0.0172503772 -0.0108994643 -0.0100155497
CTAGE11P AL355001.1 AL022316.1 PRDX3P2
0.0059144276 -0.0078457384 -0.0172823521 -0.0105412767
MTHFD1P1 AC002480.1 SCAT8 PWP2
-0.0035888050 0.0047360137 0.0010923467 0.0090971061
PTPRJ.AS1 HPR AL049839.2 SOWAHCP2
-0.0011330955 0.0002045581 0.0045504295 -0.0013408436
AC016583.2
0.0054207717
主要參考資料來(lái)自博客園,作者小高不高,鏈接:
https://www.cnblogs.com/xiaogaobugao/p/17131487.html。
聯(lián)系客服