在生物学上,经常会遇到找control和treat的差异基因或者任意两个或者两个以上处理条件下,最差异的变化,比如我有这样一个数据,几千个细胞分为处理过的和没处理过的,然后通过拍照记录了他们的形态大小等几十个特征,我想知道哪个特征产生了最大的变化。
代码语言:javascript复制head(actin_vim_TC)
image.png
代码语言:javascript复制library(limma)
#分组
group<-as.factor(unlist(lapply(rownames(actin_vim_TC),function(x){strsplit(x,"\_")[[1]][2]})))
levels(group)[levels(group) %in% c(2:12)] <-"C"
levels(group)[levels(group) %in% c(3:23)] <-"T"
design <- model.matrix(~0 factor(group))
colnames(design)=levels(factor(group))
rownames(design)=rownames(actin_vim_TC)
design
contrast.matrix<-makeContrasts(paste0(unique(group),collapse = "-"),levels = design)
##step1
fit <- lmFit(t(actin_vim_TC),design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
##step3
DEfeature = topTable(fit2, coef=1, n=Inf)
DEfeature<-DEfeature[order(abs(DEfeature$logFC),decreasing = T),]
head(DEfeature)
前几个差异最大的feature如下
image.png
将所有细胞做PCA
代码语言:javascript复制#PCA
actin_vim.pca <- prcomp(as.matrix(actin_vim_TC)[,rownames(DEfeature)[1:20]], center = F,scale = F)
summary(actin_vim_TC)
str(actin_vim_TC)
summary(actin_vim.pca)
type_old<-as.factor(unlist(lapply(rownames(actin_vim.pca$x),function(x){strsplit(x,"\_")[[1]][2]})))
levels(type_old)[levels(type_old) %in% c(2:12) ]<- "C"
levels(type_old)[levels(type_old) %in% c(13:23) ]<- "T"
#plot
gg<-cbind(as.data.frame(type_old),as.data.frame(actin_vim.pca$x))
gg$type_old<-as.factor(gg$type_old)
ggplot(gg,aes(x = PC1,y = PC2,color = gg$type_old)) geom_point(alpha = 0.5)
image.png
可以明显看到两群细胞分为不同的分布方向,所以查看较大特征值和特征向量
代码语言:javascript复制#show the feature
library(factoextra)
# Visualize variable with cos2 >= 0.85
f<-fviz_pca_var(actin_vim.pca, select.var = list(cos2 = 0.85), repel=T, col.var = "cos2", geom.var = c("arrow", "text") )
f
image.png
代码语言:javascript复制#监督聚类
library(caret)
library(kernlab)
Data<-as.matrix(actin_vim_TC)[,rownames(DEfeature)[1:20]]
Data<-cbind(Data,as.data.frame(type_old))
intrain<-createDataPartition(Data$type_old,p = 0.75,list = F)
head(intrain)
training<-Data[intrain,]
testing<-Data[-intrain,]
modelFit<-train(type_old~.,data=training,method = "glm")
modelFit$finalModel
predictions <- predict(modelFit,newdata=testing)
predictions
confusionMatrix(predictions,testing$type)
image.png
正确率也有85%,看看ROC曲线
代码语言:javascript复制library(pROC)
Roc=roc(testing$type_old,factor(predictions,ordered = T))
Roc
plot(Roc, print.auc = TRUE, auc.polygon = TRUE, legacy.axes = TRUE,
grid = c(0.1, 0.2), grid.col = c("green", "red"), max.auc.polygon = TRUE,
auc.polygon.col = "skyblue", print.thres = TRUE, xlab = "specificity", ylab = "sensitivity",
main = "")
image.png
查看机器学习分群的feature重要性
代码语言:javascript复制importance <- varImp(modelFit, scale=FALSE)
# summarize importance
print(importance)
# plot importance
plot(importance)
image.png
我们可以看到三种方式的结果几乎是差不多的,说明差异最显著的feature是在不同的方法计算方式都是稳定的。