书接上文 全基因组关联分析(GWAS)学习笔记——3.1
参考资料
- https://github.com/MareesAT/GWA_tutorial/
- 全基因组关联分析学习资料(GWAS tutorial)
- 论文 A tutorial on conducting genome-wide association studies: Quality control and statistical analysis
继续对数据集进行质控
检查个体杂合性,去除杂合性偏离平均值3个标准差的个体
这一步突然多出来一个inversion.txt文件,怎么来的还不太清楚 使用到的命令是
代码语言:javascript复制plink --bfile HapMap_3_r3_9 --exclude inversion.txt --range --indep-pairwise 50 5 0.2 --out indepSNP
plink --bfile HapMap_3_r3_9 --extract indepSNP.prune.in --het --out R_check
对结果进行可视化
代码语言:javascript复制het <- read.table("R_check.het", head=TRUE)
head(het)
het$HET_RATE = (het$"N.NM." - het$"O.HOM.")/het$"N.NM."
hist(het$HET_RATE, xlab="Heterozygosity Rate", ylab="Frequency", main= "Heterozygosity Rate")
ggplot(het,aes(x=HET_RATE,y=..count..))
geom_histogram(bins=90,fill="orange")
theme_bw()
image.png 选择杂合性超过平均值3个标准差的个体
代码语言:javascript复制het_fail <- subset(het, (het$HET_RATE < mean(het$HET_RATE)-3*sd(het$HET_RATE)) | (het$HET_RATE > mean(het$HET_RATE) 3*sd(het$HET_RATE)));
het_fail$HET_DST <- (het_fail$HET_RATE-mean(het$HET_RATE))/sd(het$HET_RATE);
head(het_fail)
write.table(het_fail, "fail-het-qc.txt", row.names=FALSE)
结果中筛选出两个个体,在数据集中去除这两个个体
代码语言:javascript复制sed 's/"// g' fail-het-qc.txt | awk '{print$1, $2}'> het_fail_ind.txt
plink --bfile HapMap_3_r3_9 --remove het_fail_ind.txt --make-bed --out HapMap_3_r3_10
检查个体间的亲缘关系
代码语言:javascript复制plink --bfile HapMap_3_r3_10 --extract indepSNP.prune.in --genome --min 0.2 --out pihat_min0.2
awk '{ if ($8 >0.9) print $0 }' pihat_min0.2.genome > zoom_pihat.genome
对结果可视化
代码语言:javascript复制relatedness <- read.table("pihat_min0.2.genome", header=T)
par(pch=16, cex=1)
with(relatedness,plot(Z0,Z1, xlim=c(0,1), ylim=c(0,1), type="n"))
with(subset(relatedness,RT=="PO") , points(Z0,Z1,col=4))
with(subset(relatedness,RT=="UN") , points(Z0,Z1,col=3))
legend(1,1, xjust=1, yjust=1, legend=levels(relatedness$RT), pch=16, col=c(4,3))
image.png
代码语言:javascript复制relatedness_zoom <- read.table("zoom_pihat.genome", header=T)
par(pch=16, cex=1)
with(relatedness_zoom,plot(Z0,Z1, xlim=c(0,0.02), ylim=c(0.98,1), type="n"))
with(subset(relatedness_zoom,RT=="PO") , points(Z0,Z1,col=4))
with(subset(relatedness_zoom,RT=="UN") , points(Z0,Z1,col=3))
legend(0.02,1, xjust=1, yjust=1, legend=levels(relatedness$RT), pch=16, col=c(4,3))
image.png
代码语言:javascript复制hist(relatedness[,10],main="Histogram relatedness", xlab= "Pihat")
image.png
代码语言:javascript复制plink --bfile HapMap_3_r3_10 --filter-founders --make-bed --out HapMap_3_r3_11
plink --bfile HapMap_3_r3_11 --extract indepSNP.prune.in --genome --min 0.2 --out pihat_min0.2_in_founders
plink --bfile HapMap_3_r3_11 --missing
plink --bfile HapMap_3_r3_11 --remove 0.2_low_call_rate_pihat.txt --make-bed --out HapMap_3_r3_12
至此教程的第一部分就结束了,但是关于亲缘关系这部分代码还完全看不懂!还是先重复完流程再说吧!
接下来重复另外的内容
这部分内容好像还是质控,但是用到了更大的数据集,不太明白做这部分内容的意义,先把这部分代码运行完,因为接下来的步骤需要这部分运行的结果文件 代码
代码语言:javascript复制wget ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20100804/ALL.2of4intersection.20100804.genotypes.vcf.gz
plink --vcf ALL.2of4intersection.20100804.genotypes.vcf.gz --make-bed --out ALL.2of4intersection.20100804.genotypes
plink --bfile ALL.2of4intersection.20100804.genotypes --set-missing-var-ids @:#[b37]$1,$2 --make-bed --out ALL.2of4intersection.20100804.genotypes_no_missing_IDs
plink --bfile ALL.2of4intersection.20100804.genotypes_no_missing_IDs --geno 0.2 --allow-no-sex --make-bed --out 1kG_MDS
plink --bfile 1kG_MDS --mind 0.2 --allow-no-sex --make-bed --out 1kG_MDS2
plink --bfile 1kG_MDS2 --geno 0.02 --allow-no-sex --make-bed --out 1kG_MDS3
plink --bfile 1kG_MDS3 --mind 0.02 --allow-no-sex --make-bed --out 1kG_MDS4
plink --bfile 1kG_MDS4 --maf 0.05 --allow-no-sex --make-bed --out 1kG_MDS5
awk '{print$2}' HapMap_3_r3_12.bim > HapMap_SNPs.txt
plink --bfile 1kG_MDS5 --extract HapMap_SNPs.txt --make-bed --out 1kG_MDS6
awk '{print$2}' 1kG_MDS6.bim > 1kG_MDS6_SNPs.txt
plink --bfile HapMap_3_r3_12 --extract 1kG_MDS6_SNPs.txt --recode --make-bed --out HapMap_MDS
awk '{print$2,$4}' HapMap_MDS.map > buildhapmap.txt
plink --bfile 1kG_MDS6 --update-map buildhapmap.txt --make-bed --out 1kG_MDS7
awk '{print$2,$5}' 1kG_MDS7.bim > 1kg_ref-list.txt
plink --bfile HapMap_MDS --reference-allele 1kg_ref-list.txt --make-bed --out HapMap-adj
awk '{print$2,$5,$6}' 1kG_MDS7.bim > 1kGMDS7_tmp
awk '{print$2,$5,$6}' HapMap-adj.bim > HapMap-adj_tmp
sort 1kGMDS7_tmp HapMap-adj_tmp |uniq -u > all_differences.txt
awk '{print$1}' all_differences.txt | sort -u > flip_list.txt
plink --bfile HapMap-adj --flip flip_list.txt --reference-allele 1kg_ref-list.txt --make-bed --out corrected_hapmap
awk '{print$2,$5,$6}' corrected_hapmap.bim > corrected_hapmap_tmp
sort 1kGMDS7_tmp corrected_hapmap_tmp |uniq -u > uncorresponding_SNPs.txt
awk '{print$1}' uncorresponding_SNPs.txt | sort -u > SNPs_for_exlusion.txt
plink --bfile corrected_hapmap --exclude SNPs_for_exlusion.txt --make-bed --out HapMap_MDS2
plink --bfile 1kG_MDS7 --exclude SNPs_for_exlusion.txt --make-bed --out 1kG_MDS8
plink --bfile HapMap_MDS2 --bmerge 1kG_MDS8.bed 1kG_MDS8.bim 1kG_MDS8.fam --allow-no-sex --make-bed --out MDS_merge2
plink --bfile MDS_merge2 --extract indepSNP.prune.in --genome --out MDS_merge2
plink --bfile MDS_merge2 --read-genome MDS_merge2.genome --cluster --mds-plot 10 --out MDS_merge2
wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20100804/20100804.ALL.panel
awk '{print$1,$1,$2}' 20100804.ALL.panel > race_1kG.txt
sed 's/JPT/ASN/g' race_1kG.txt>race_1kG2.txt
sed 's/ASW/AFR/g' race_1kG2.txt>race_1kG3.txt
sed 's/CEU/EUR/g' race_1kG3.txt>race_1kG4.txt
sed 's/CHB/ASN/g' race_1kG4.txt>race_1kG5.txt
sed 's/CHD/ASN/g' race_1kG5.txt>race_1kG6.txt
sed 's/YRI/AFR/g' race_1kG6.txt>race_1kG7.txt
sed 's/LWK/AFR/g' race_1kG7.txt>race_1kG8.txt
sed 's/TSI/EUR/g' race_1kG8.txt>race_1kG9.txt
sed 's/MXL/AMR/g' race_1kG9.txt>race_1kG10.txt
sed 's/GBR/EUR/g' race_1kG10.txt>race_1kG11.txt
sed 's/FIN/EUR/g' race_1kG11.txt>race_1kG12.txt
sed 's/CHS/ASN/g' race_1kG12.txt>race_1kG13.txt
sed 's/PUR/AMR/g' race_1kG13.txt>race_1kG14.txt
awk '{print$1,$2,"OWN"}' HapMap_MDS.fam>racefile_own.txt
cat race_1kG14.txt racefile_own.txt | sed -e '1iFID IID race' > racefile.txt
awk '{ if ($4 <-0.04 && $5 >0.03) print $1,$2 }' MDS_merge2.mds > EUR_MDS_merge2
plink --bfile HapMap_3_r3_12 --keep EUR_MDS_merge2 --make-bed --out HapMap_3_r3_13
plink --bfile HapMap_3_r3_13 --extract indepSNP.prune.in --genome --out HapMap_3_r3_13
plink --bfile HapMap_3_r3_13 --read-genome HapMap_3_r3_13.genome --cluster --mds-plot 10 --out HapMap_3_r3_13_mds
awk '{print$1, $2, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13}' HapMap_3_r3_13_mds.mds > covar_mds.txt
运行完以上代码就得到了下一步分析需要用到的covar_mds.txt文件
但是这一步用到的数据集很大,需要的时间会比较长
紧接着重复教程的第三部分内容
这部分教程用到了4个数据文件
- HapMap_3_r3_13.bed
- HapMap_3_r3_13.bim
- HapMap_3_r3_13.fam
- covar_mds.txt
代码
代码语言:javascript复制plink --bfile HapMap_3_r3_13 --assoc --out assoc_results
plink --bfile HapMap_3_r3_13 --covar covar_mds.txt --logistic --hide-covar --out logistic_results
awk '!/'NA'/' logistic_results.assoc.logistic > logistic_results.assoc_2.logistic
plink --bfile HapMap_3_r3_13 -assoc --adjust --out adjusted_assoc_results
awk '{ if ($4 >= 21595000 && $4 <= 21605000) print $2 }' HapMap_3_r3_13.bim > subset_snp_chr_22.txt
plink --bfile HapMap_3_r3_13 --extract subset_snp_chr_22.txt --make-bed --out HapMap_subset_for_perm
plink --bfile HapMap_subset_for_perm --assoc --mperm 1000000 --out subset_1M_perm_result
sort -gk 4 subset_1M_perm_result.assoc.mperm > sorted_subset.txt
head sorted_subset.txt
曼哈顿图
代码语言:javascript复制install.packages("qqman")
library(qqman)
jpeg("Logistic_manhattan.jpeg")
manhattan(results_log,chr="CHR",bp="BP",p="P",snp="SNP",main="Manhattan plot:logistic")
dev.off()
results_as <- read.table("assoc_results.assoc", head=TRUE)
jpeg("assoc_manhattan.jpeg")
manhattan(results_as,chr="CHR",bp="BP",p="P",snp="SNP", main = "Manhattan plot: assoc")
dev.off()
image.png
image.png
qq图
代码语言:javascript复制results_log <- read.table("logistic_results.assoc_2.logistic", head=TRUE)
jpeg("QQ-Plot_logistic.jpeg")
qq(results_log$P, main = "Q-Q plot of GWAS p-values : log")
dev.off()
results_as <- read.table("assoc_results.assoc", head=TRUE)
jpeg("QQ-Plot_assoc.jpeg")
qq(results_as$P, main = "Q-Q plot of GWAS p-values : log")
dev.off()
image.png
image.png
至此把教程重复了一遍,但是还有好多细节都不太懂,需要看教程对应的论文。重复教程的过程没有见到表型数据,不知道是为什么?