GATK变异检测

2023-09-04 15:08:51 浏览数 (2)

一、生成bam

代码语言:javascript复制
#fastqc质控
mkdir 20.human;cd 20.human
mkdir ref;cd ref
axel -n 100 https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta
#建立bwa-mem2索引
bwa-mem2 index -p Homo_sapiens_assembly38.fasta Homo_sapiens_assembly38.fasta

vi reads.list
/share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L001_R1_001.fastq.gz /share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L001_R2_001.fastq.gz
/share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L001_R1_002.fastq.gz /share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L001_R2_002.fastq.gz
/share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L001_R1_003.fastq.gz /share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L001_R2_003.fastq.gz
/share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L001_R1_004.fastq.gz /share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L001_R2_004.fastq.gz
/share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L002_R1_001.fastq.gz /share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L002_R2_001.fastq.gz
/share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L002_R1_002.fastq.gz /share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L002_R2_002.fastq.gz
/share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L002_R1_003.fastq.gz /share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L002_R2_003.fastq.gz
/share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L002_R1_004.fastq.gz /share/home/xiehs/data/Project_RM8398/Sample_U0a/U0a_CGATGT_L002_R2_004.fastq.gz

mkdir qc
nohup fastqc -f fastq -o qc -t 16 /share/home/xiehs/data/Project_RM8398/Sample_U0a/*.fastq.gz &
multiqc qc -o multiqc
#fastp过滤
cat reads.list | while read {i,j};do echo fastp -i ${i} -I ${j} -o ${i%*.fastq.gz}_clean.fastq.gz -O ${j%*.fastq.gz}_clean.fastq.gz -z 4 -q 20 -u 40 -n 10 ;done;
sed -i 's#/share/home/xiehs/data/Project_RM8398/Sample_U0a/##3' fastp.sh
sed -i 's#/share/home/xiehs/data/Project_RM8398/Sample_U0a/##3' fastp.sh
bsub -q fat -n 10 -o %J.log -e %J.err sh fastp.sh

vi clean.list
ls *.gz | xargs -n 2 #后在修改
clean/U0a_CGATGT_L001_R1_001_clean.fastq.gz clean/U0a_CGATGT_L001_R2_001_clean.fastq.gz
clean/U0a_CGATGT_L001_R1_002_clean.fastq.gz clean/U0a_CGATGT_L001_R2_002_clean.fastq.gz
clean/U0a_CGATGT_L001_R1_003_clean.fastq.gz clean/U0a_CGATGT_L001_R2_003_clean.fastq.gz
clean/U0a_CGATGT_L001_R1_004_clean.fastq.gz clean/U0a_CGATGT_L001_R2_004_clean.fastq.gz
clean/U0a_CGATGT_L002_R1_001_clean.fastq.gz clean/U0a_CGATGT_L002_R2_001_clean.fastq.gz
clean/U0a_CGATGT_L002_R1_002_clean.fastq.gz clean/U0a_CGATGT_L002_R2_002_clean.fastq.gz
clean/U0a_CGATGT_L002_R1_003_clean.fastq.gz clean/U0a_CGATGT_L002_R2_003_clean.fastq.gz
clean/U0a_CGATGT_L002_R1_004_clean.fastq.gz clean/U0a_CGATGT_L002_R2_004_clean.fastq.gz

#bwa-mem2比对
mv filter clean
cat clean.list | while read {i,j};do echo bwa-mem2 mem -t 4 -o ${i%*_clean.fastq.gz}.sam -R '@RG\tID:A1\tPL:illumina\tSM:human' /share/home/xiehs/20.human/ref/Homo_sapiens_assembly38.fasta ${i} ${j};done;
bsub -q fat -n 4 -o %J.log -e %J.err sh bwa.sh

二、samtools 处理

代码语言:javascript复制
ls -1 *.sam | while read i;do echo samtools sort -@ 12 -O bam -o ${i%.sam}.sorted.bam $i >>sam2bam.sh;done;
#报错参考 https://www.cnblogs.com/huanping/p/13786701.html
ln -s /usr/lib64/libcrypto.so.1.0.2k /share/home/xiehs/Software/miniconda3/envs/human/lib/libcrypto.so.1.0.0

bsub -q fat -n 12 -o %J.log -e %J.err sh sam2bam.sh
ls -1 *.bam | while read i ;do echo samtools index -@ 24 ${i}>>bai.sh;done;
bsub -q fat -n 24 -o %J.log -e %J.err sh bai.sh
# 一步管道命令
# bwa mem -t 4 -R '@RGtID:A1tPL:illuminatSM:human' ref.fna clean.1.fq.gz clean.2.fq.gz | samtools view -Sb - | samtools sort -> A1.sort.bam


echo samtools merge -@ 24 -t -o merge.sorted.bam *.bam >merge.sh
bsub -q fat -n 24 -o %J.log -e %J.err sh merge.sh
samtools index merge.sorted.bam
samtools quickcheck merge.sorted.bam

三、标记 Duplication

代码语言:javascript复制
gatk MarkDuplicates -I merge.sorted.bam -M merge.markdup_metrics.txt -O merge.sorted.markdup.bam
samtools index merge.sorted.markdup.bam 

Duplication 对变异检测的影响

四、BQSR

代码语言:javascript复制
#建立模型 --java-options "-Xmx8G -Djava.io.tmpdir=./" 加上就不报错了,不加会报错
time gatk BaseRecalibrator --java-options "-Xmx8G -Djava.io.tmpdir=./" -R /share/home/xiehs/data/GATK/hg38/Homo_sapiens_assembly38.fasta -I merge.sorted.markdup.bam --known-sites /share/home/xiehs/data/GATK/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz --known-sites /share/home/xiehs/data/GATK/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --known-sites /share/home/xiehs/data/GATK/hg38/dbsnp_146.hg38.vcf.gz -O merge.sorted.markdup.recal_data.table >bqsr.log
#应用模型
time gatk ApplyBQSR --bqsr-recal-file merge.sorted.markdup.recal_data.table -R /share/home/xiehs/data/GATK/hg38/Homo_sapiens_assembly38.fasta -I merge.sorted.markdup.bam -O merge.sorted.markdup.BQSR.bam
samtools flagstat merge.sorted.markdup.BQSR.bam
#建立索引
time samtools index merge.sorted.markdup.BQSR.bam

五、变异检测

代码语言:javascript复制
#生成gvcf  
time gatk HaplotypeCaller --emit-ref-confidence GVCF -R /share/home/xiehs/data/GATK/hg38/Homo_sapiens_assembly38.fasta -I merge.sorted.markdup.BQSR.bam -O merge.HC.g.vcf.gz 

#合并gvcf  
time gatk GenotypeGVCFs -R /share/home/xiehs/data/GATK/hg38/Homo_sapiens_assembly38.fasta -V merge.HC.g.vcf.gz -O merge.HC.vcf.gz

六、结果过滤

6.1 VQSR

准备的已知变异集作为训练集,可以是 Hapmap、OMNI,1000G,dbsnp,瓶中基因组计划等这些国际性项目的数据,然后利用训练集对每一个位点进行过滤。利用 VariantRecalibrator工具进行机器学习,ApplyVQSR 工具进行处理。VQSR 过滤 SNP 和 InDel 分别进行,首先处理 SNP,得到结果后,再进行 InDel 处理。

代码语言:javascript复制
#处理SNP  
# --max-gaussians默认值为8,报错提示需要降低
gatk VariantRecalibrator --max-gaussians 6 -R /share/home/xiehs/data/GATK/hg38/Homo_sapiens_assembly38.fasta -V merge.HC.vcf.gz --resource:hapmap,known=false,training=true,truth=true,prior=15.0 /share/home/xiehs/data/GATK/hg38/hapmap_3.3.hg38.vcf.gz --resource:omni,known=false,training=true,truth=false,prior=12.0 /share/home/xiehs/data/GATK/hg38/1000G_omni2.5.hg38.vcf.gz --resource:1000G,known=false,training=true,truth=false,prior=10.0 /share/home/xiehs/data/GATK/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz --resource:dbsnp,known=true,training=false,truth=false,prior=2.0 /share/home/xiehs/data/GATK/hg38/dbsnp_146.hg38.vcf.gz -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -mode SNP -O merge.HC.snps.recal --tranches-file output.tranches --rscript-file output.plots.R

 
gatk ApplyVQSR -R /share/home/xiehs/data/GATK/hg38/Homo_sapiens_assembly38.fasta -V merge.HC.vcf.gz -O merge.HC.snps.VQSR.vcf.gz --recal-file merge.HC.snps.recal --tranches-file merge.HC.snps.tranches -mode SNP

1、HapMap,它来自国际人类单倍体型图计划,数据集包含了大量家系数据,并且有非常严格的质控和严密的实验验证,因此它的准确性是目前公认最高的。

2、Omni,这个数据源自 Illumina 的 Omni 基因型芯片,它的验证结果常常作为基因型的金标准。

3、1000G 千人基因组计划(1000 genomes project)质控后的变异数据,质控后,它包含的绝大部分都是真实的变异,但由于没办法做全面的实验验证,并不能排除含有少部分假阳性的结果。

4、dbSNP。dbSNP 收集的数据,实际都是研究者们发表了相关文章提交上来的变异,这些变异很多是没做过严格验证的。

处理 InDel

代码语言:javascript复制
#处理InDel  
gatk VariantRecalibrator -R /share/home/xiehs/data/GATK/hg38/Homo_sapiens_assembly38.fasta -V merge.HC.snps.VQSR.vcf.gz --max-gaussians 4 --resource:mills,known=false,training=true,truth=true,prior=12.0 /share/home/xiehs/data/GATK/hg38/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --resource:dbsnp,known=true,training=false,truth=false,prior=2.0 /share/home/xiehs/data/GATK/hg38/dbsnp_146.hg38.vcf.gz -an QD -an DP -an FS -an SOR -an ReadPosRankSum -an MQRankSum -mode INDEL -O merge.HC.snps.indel.recal --tranches-file merge.HC.snps.indel.tranches --rscript-file merge.HC.snps.indel.plots.R

gatk ApplyVQSR -R /share/home/xiehs/data/GATK/hg38/Homo_sapiens_assembly38.fasta -V merge.HC.snps.VQSR.vcf.gz -O merge.HC.snps.indel.VQSR.vcf.gz --truth-sensitivity-filter-level 99.0 --tranches-file merge.HC.snps.indel.tranches --recal-file merge.HC.snps.indel.recal -mode INDEL

6.2 Hard-filter

Hard-filter 硬过滤,可以根据以下标准来进行过滤,gatk 过滤的时候,snp 与 indel 是分别进行的。也可以选择 bcftools 进行简单过滤。

QualByDepth(QD)

FisherStrand (FS)

StrandOddsRatio (SOR)

RMSMappingQuality (MQ)

MappingQualityRankSumTest (MQRankSum)

ReadPosRankSumTest (ReadPosRankSum)

此脚本为硬过滤(hard-filter)的方法,主要用于不能进行 VQSR 的情况,例如非人物种,或外显子,芯片数据等。

代码语言:javascript复制
# 使用SelectVariants,选出SNP  
time gatk SelectVariants -select-type SNP -V merge.HC.vcf.gz -O merge.HC.vcf.snp.gz
  
# 为SNP作硬过滤  
time gatk VariantFiltration -V merge.HC.vcf.snp.gz --filter-expression "QD < 2.0 || MQ < 40.0 || FS > 60.0 || SOR > 3.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" --filter-name "PASS" -O merge.HC.vcf.snp.filter.gz
  
# 使用SelectVariants,选出Indel  
time gatk SelectVariants -select-type INDEL -V merge.HC.vcf.gz -O merge.HC.indel.vcf.gz
  
# 为Indel作过滤  
time gatk VariantFiltration -V merge.HC.indel.vcf.gz --filter-expression "QD < 2.0 || FS > 200.0 || SOR > 10.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" --filter-name "PASS" -O merge.HC.indel.filter.vcf.gz
  
# 重新合并过滤后的SNP和Indel  
time gatk MergeVcfs -I merge.HC.snp.filter.vcf.gz -I merge.HC.indel.filter.vcf.gz -O merge.HC.filter.vcf.gz

0 人点赞