你还缺乳腺癌表达量数据集吗

2020-12-03 15:35:28 浏览数 (1)

生存分析你还是在TCGA吗?

最近有粉丝求助说他研究乳腺癌做了单细胞转录组数据,定位到了一个稀有细胞亚群,想看他感兴趣的亚群细胞特异性基因的临床意义,问我有没有除了TCGA数据库之外的其它数据库资源推荐。恰好我做这方面就顺手检索了一下,发现了 curatedBreastData 包,值得推荐!

安装和加载相信已经无需我多说了:

代码语言:R
# Install curatedBreastData from Bioconductor only when it is missing:
# the package is close to 300 MB, so avoid re-downloading it on every run.
if (!requireNamespace("curatedBreastData", quietly = TRUE)) {
  # BiocManager itself lives on CRAN and may not be installed yet.
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("curatedBreastData")
}
library("curatedBreastData")

包内置的表达矩阵数据集

代码语言:R
# Load the list of expression datasets bundled with curatedBreastData.
data("curatedBreastDataExprSetList")
# Number of datasets in the list (34 according to the original post).
length(curatedBreastDataExprSetList)
# Dataset names; each one embeds a study ID and a GPL platform ID.
names(curatedBreastDataExprSetList)

可以看到大部分数据集用的都是affy公司的芯片:GPL96(即hgu133a)和GPL570(即hgu133plus2):

代码语言:R
> names(curatedBreastDataExprSetList)
 [1] "study_1379_GPL1223_all"                      "study_2034_GPL96_all"                       
 [3] "study_4913_GPL3558_all"                      "study_6577_GPL3883_all"                     
 [5] "study_9893_GPL5049_all"                      "study_12071_GPL5186_all"                    
 [7] "study_12093_GPL96_all"                       "study_16391_GPL570_all"                     
 [9] "study_16446_GPL570_all"                      "study_17705_GPL96_JBI_Tissue_BC_Tamoxifen"  
[11] "study_17705_GPL96_MDACC_Tissue_BC_Tamoxifen" "study_18728_GPL570_all"                     
[13] "study_19615_GPL570_all"                      "study_19697_GPL570_all"                     
[15] "study_20181_GPL96_all"                       "study_20194_GPL96_all"                      
[17] "study_21974_GPL6480_all"                     "study_21997_GPL1390_all"                    
[19] "study_21997_GPL5325_all"                     "study_21997_GPL7504_all"                    
[21] "study_22226_GPL1708_all"                     "study_22226_GPL4133_all"                    
[23] "study_22358_GPL5325_all"                     "study_23428_GPL5325_all"                    
[25] "study_25055_GPL96_MDACC_M"                   "study_25055_GPL96_MDACC_PERU"               
[27] "study_25065_GPL96_LBJ"                       "study_25065_GPL96_MDACC"                    
[29] "study_25065_GPL96_MDACC_MDA"                 "study_25065_GPL96_PERU"                     
[31] "study_25065_GPL96_Spain"                     "study_25065_GPL96_USO"                      
[33] "study_32646_GPL570_all"                      "study_33658_GPL570_all"   

每个数据集都是一个独立的ExpressionSet对象:

代码语言:R
> eset=curatedBreastDataExprSetList[[3]]
> head(pData(eset)[c(1:3), c(1:10)])
                  datasetName dbUniquePatientID study_ID.x patient_ID GEO_GSMID platform_ID
110388 study_4913_GPL3558_all               597       4913     110388    110388        3558
110392 study_4913_GPL3558_all               598       4913     110392    110392        3558
110394 study_4913_GPL3558_all               599       4913     110394    110394        3558
       GEO_platform_ID AE_platform_ID coordinating_GSE_series_GSMID original_study_ID
110388         GPL3558           <NA>                            NA         wsb 10167
110392         GPL3558           <NA>                            NA          wsb 1281
110394         GPL3558           <NA>                            NA          wsb 1319

这个ExpressionSet对象凡是搞GEO数据挖掘的应该是都没有问题的。

可以看到,每个数据集的独立对象里面其实是有该样品的表型信息,但是没有临床属性,它存储在另外一个对象。可以看到,34个数据集合起来是2719个样品,而且它们总共是139个临床属性啦。

探索这个临床属性的代码如下:

代码语言:R
# Load the clinical annotation object shipped with curatedBreastData and
# keep a short alias for its patient-level table.
data("clinicalData")
clin_tbl <- clinicalData$clinicalTable

# Peek at the first two clinical variable definitions.
clinicalData$clinicalVarDef[1:2, ]

# Treatment information: first three patients, column 112 through the last.
clin_tbl[1:3, 112:ncol(clin_tbl)]

# Count patients flagged as having received chemotherapy
# (roughly 1500 according to the original author).
numChemoPatients <- sum(clin_tbl$chemotherapyClass == 1, na.rm = TRUE)
numChemoPatients

# Count patients whose chemotherapy specifically included a taxane.
numChemoTaxane <- sum(clin_tbl$taxane == 1, na.rm = TRUE)
numChemoTaxane

# Count patients with documented adjuvant therapy
# (over 1000 according to the original author).
numAdjPatients <- sum(clin_tbl$neoadjuvant_or_adjuvant == "adj", na.rm = TRUE)
numAdjPatients

# Patients with a non-missing binary overall-survival (OS) value.
sum(!is.na(clin_tbl$OS))

# Patients with OS available in the more granular months form; this variable
# also covers studies that had a ceiling for tracking OS.
sum(!is.na(clin_tbl$OS_months_or_MIN_months_of_OS))

# Patients whose OS was definitively followed up until death
# (how studies collect OS data can be surprising).
sum(!is.na(clin_tbl$OS_up_until_death))

这个包还提供了一个数据处理函数:The wrapper function processExpressionSetList() completes all these post-processing steps on a list of S4 ExpressionSet objects like the curatedBreastDataExprSetList.rda list provided in this package.

不过,我觉得没有必要使用它的函数了。

0 人点赞