CEL文件:探针的信号值和定位信息,是Affymetrix公司的芯片原始数据。
CEL files contain information on the probe set's intensity values, and a probe set represents a gene. Information about probes gets extracted from the image data by Affymetrix, an image analysis software.
代码语言:shell复制library(affy)
library(limma)
library(stringr)
library(AnnoProbe)
library(magrittr)
0. 下载原始 数据
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE66229
代码语言:shell复制mkdir data
cd data
tar -xf GSE66229.TAR
rm GSE66229.TAR
1. 读取原始数据
代码语言:text复制rawdata <- affy::ReadAffy(celfile.path = "data")
2. rma标准化
代码语言:text复制rawdata %<>% affy::rma()
exprs <- Biobase::exprs(rawdata)
range(exprs, na.rm = TRUE) # 1.889917 14.620563 不超过50不需要log2转化
# 列重命名
colnames(exprs) <- stringr::str_split(string=colnames(exprs),pattern = "_", simplify = T)[, 1]
3. limma标准化
代码语言:text复制boxplot(exprs, outline = FALSE, notch = FALSE, las = 2)
#分位数校正
exprs %<>% limma::normalizeBetweenArrays()
boxplot(exprs, outline = FALSE, notch = FALSE, las = 2)
range(exprs, na.rm = TRUE) # 2.09520 14.30741
exprs %<>% as_tibble(rownames = "probe_id")
4. 获取探针注释文件
代码语言:text复制# 得到探针对应的基因名字
probe2Symbol <- AnnoProbe::idmap("GPL570")
# 看看前几行
head(probe2Symbol,3)
# probe_id symbol
# 193731 1053_at RFC2
# 193732 117_at HSPA6
# 193733 121_at PAX8
5.ID转换
代码语言:text复制transid <- function(probe2Symbol, exprs, method = "median") {
probe2Symbol$probe_id %<>% as.character()
exprs$probe_id %<>% as.character()
exprs %>%
dplyr::inner_join(probe2Symbol, by = "probe_id") %>%
dplyr::select(-probe_id) %>%
dplyr::select(symbol, everything()) %>%
dplyr::mutate(ref = apply(across(where(is.numeric)), 1, method)) %>%
dplyr::arrange(desc(ref)) %>%
dplyr::select(-ref) %>%
dplyr::distinct(symbol, .keep_all = TRUE)
}
expression <- transid(probe2Symbol, exprs, method = "median")