寻找差异的feature

2020-04-01 15:51:20 浏览数 (1)

在生物学上,经常会遇到找control和treat的差异基因或者任意两个或者两个以上处理条件下,最差异的变化,比如我有这样一个数据,几千个细胞分为处理过的和没处理过的,然后通过拍照记录了他们的形态大小等几十个特征,我想知道哪个特征产生了最大的变化。

代码语言:javascript复制
head(actin_vim_TC)

image.png

代码语言:javascript复制
library(limma)
#分组
group<-as.factor(unlist(lapply(rownames(actin_vim_TC),function(x){strsplit(x,"\_")[[1]][2]})))
levels(group)[levels(group) %in% c(2:12)] <-"C"
levels(group)[levels(group) %in% c(3:23)] <-"T"
design <- model.matrix(~0 factor(group))
colnames(design)=levels(factor(group))
rownames(design)=rownames(actin_vim_TC)
design
contrast.matrix<-makeContrasts(paste0(unique(group),collapse = "-"),levels = design)
##step1
fit <- lmFit(t(actin_vim_TC),design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix) 
fit2 <- eBayes(fit2)  
##step3
DEfeature = topTable(fit2, coef=1, n=Inf)
DEfeature<-DEfeature[order(abs(DEfeature$logFC),decreasing = T),]
head(DEfeature)

前几个差异最大的feature如下

image.png

将所有细胞做PCA

代码语言:javascript复制
#PCA
actin_vim.pca <- prcomp(as.matrix(actin_vim_TC)[,rownames(DEfeature)[1:20]], center = F,scale = F)
summary(actin_vim_TC)
str(actin_vim_TC)
summary(actin_vim.pca)
type_old<-as.factor(unlist(lapply(rownames(actin_vim.pca$x),function(x){strsplit(x,"\_")[[1]][2]})))
levels(type_old)[levels(type_old) %in% c(2:12) ]<- "C"
levels(type_old)[levels(type_old) %in% c(13:23) ]<- "T"
#plot
gg<-cbind(as.data.frame(type_old),as.data.frame(actin_vim.pca$x))
gg$type_old<-as.factor(gg$type_old)
ggplot(gg,aes(x = PC1,y = PC2,color = gg$type_old)) geom_point(alpha = 0.5)

image.png

可以明显看到两群细胞分为不同的分布方向,所以查看较大特征值和特征向量

代码语言:javascript复制
#show the feature
library(factoextra)
# Visualize variable with cos2 >= 0.85
f<-fviz_pca_var(actin_vim.pca, select.var = list(cos2 = 0.85), repel=T, col.var = "cos2", geom.var = c("arrow", "text") )
f

image.png

代码语言:javascript复制
#监督聚类
library(caret)
library(kernlab)
Data<-as.matrix(actin_vim_TC)[,rownames(DEfeature)[1:20]]
Data<-cbind(Data,as.data.frame(type_old))

intrain<-createDataPartition(Data$type_old,p = 0.75,list = F)
head(intrain)
training<-Data[intrain,]
testing<-Data[-intrain,]
modelFit<-train(type_old~.,data=training,method = "glm")
modelFit$finalModel
predictions <- predict(modelFit,newdata=testing)
predictions 
confusionMatrix(predictions,testing$type)

image.png

正确率也有85%,看看ROC曲线

代码语言:javascript复制
library(pROC)
Roc=roc(testing$type_old,factor(predictions,ordered = T))
Roc
plot(Roc, print.auc = TRUE, auc.polygon = TRUE, legacy.axes = TRUE, 
     grid = c(0.1, 0.2), grid.col = c("green", "red"), max.auc.polygon = TRUE,  
     auc.polygon.col = "skyblue", print.thres = TRUE, xlab = "specificity", ylab = "sensitivity",
     main = "")  

image.png

查看机器学习分群的feature重要性

代码语言:javascript复制
importance <- varImp(modelFit, scale=FALSE)
# summarize importance
print(importance)
# plot importance
plot(importance)

image.png

我们可以看到三种方式的结果几乎是差不多的,说明差异最显著的feature是在不同的方法计算方式都是稳定的。

0 人点赞