######################################################### FASTQ to VCF workflow ######################################################### Step 1: Create Reference Dictionary/Index (~1 hour) ######################################################### bwa index -a bwtsw hg19.fasta java -jar picard.jar CreateSequenceDictionary R= hg19.fasta O= hg19.dict samtools faidx hg19.fasta ######################################################### Step 2: Convert FASTQ file to Unmapped BAM (~40 minutes) ######################################################### java -Xmx8G -jar picard.jar FastqToSam \ FASTQ=SG5_R1.fastq \ FASTQ2=SG5_R2.fastq \ OUTPUT=SG5_fastqtosam.bam \ READ_GROUP_NAME=C7H7:5 \ SAMPLE_NAME=SG_5 \ #required LIBRARY_NAME=Agilent_SureSelect \ PLATFORM=illumina \ RUN_DATE=2017-01-20T00:30:30+00:00 #using iso8601 http://www.timestampgenerator.com/ ######################################################### STEP 3: Mark Illumina Adapters within uBAM file (~40 minutes) ######################################################### java -jar picard.jar MarkIlluminaAdapters \ I=SG5_fastqtosam.bam \ O=SG5_fastqtosam_markilluminaadapters.bam \ M=SG5_snippet_markilluminaadapters_metrics.txt 36.57 minutes ######################################################### STEP 4: Revert uBAM back to FASTQ, now with the recquired meta data (~20 minutes) ######################################################### java -jar picard.jar SamToFastq \ I=SG5_fastqtosam_markilluminaadapters.bam \ FASTQ=SG5_interleaved.fq \ CLIPPING_ATTRIBUTE=XT \ CLIPPING_ACTION=2 \ INTERLEAVE=true \ NON_PF=true \ # 0x200 failedVendorQualityChecks flag #85 ######################################################### STEP 5: Align FASTQ file to reference genome (~30 minutes) ######################################################### # nproc ->64 bwa mem -M -t 30 -p hg19.fasta \ SG5_interleaved.fq > SG5_Hg19.sam ######################################################### STEP 6: Convert aligned SAM file to BAM (~1.5 hours) ######################################################### java -jar picard.jar MergeBamAlignment \ R=hg19.fasta \ UNMAPPED_BAM=SG5_fastqtosam.bam \ ALIGNED_BAM=SG5_Hg19.sam \ O=SG5_final_hg19.bam \ CREATE_INDEX=true \ # ADD_MATE_CIGAR=true \ CLIP_ADAPTERS=false \ CLIP_OVERLAPPING_READS=true \ #default; soft-clips ends so mates do not overlap INCLUDE_SECONDARY_ALIGNMENTS=true \ MAX_INSERTIONS_OR_DELETIONS=-1 \ PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ ATTRIBUTES_TO_RETAIN=XS --------- Variant Calling (6 hours) java -jar GenomeAnalysisTK.jar \ -T HaplotypeCaller \ -R hg19.fasta \ -I SG6_final_hg19.bam \ -L 20 \ #Only used if you want to call variants for specific chromosome, ect. --genotyping_mode DISCOVERY \ -stand_call_conf 30 \ -o raw_variants.vcf java -jar GenomeAnalysisTK.jar \ -T VariantRecalibrator \ -R hg19.fasta \ -input SG5_raw_variants.vcf \ -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap.vcf \ -resource:omni,known=false,training=true,truth=true,prior=12.0 omni.vcf \ -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G.vcf \ -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.vcf \ -an QD \ -an FS \ -an SOR \ -an MQ \ -an MQRankSum \ -an ReadPosRankSum \ -mode SNP \ -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \ -recalFile recalibrate_SNP.recal \ -tranchesFile recalibrate_SNP.tranches \ -rscriptFile recalibrate_SNP_plots.R java -jar GenomeAnalysisTK.jar \ -T ApplyRecalibration \ -R hg19.fasta \ -input SG5_raw_variants.vcf \ -mode SNP \ --ts_filter_level 99.0 \ -recalFile recalibrate_SNP.recal \ -tranchesFile recalibrate_SNP.tranches \ -o recalibrated_snps_raw_indels.vcf java -jar GenomeAnalysisTK.jar \ -T VariantRecalibrator \ -R hg19.fasta \ -input recalibrated_snps_raw_indels.vcf \ -resource:mills,known=false,training=true,truth=true,prior=12.0 Mills.vcf \ -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 dbsnp.vcf \ -an QD \ -an FS \ -an SOR \ -an MQRankSum \ -an ReadPosRankSum \ -mode INDEL \ -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \ --maxGaussians 4 \ -recalFile recalibrate_INDEL.recal \ --rscriptFile recalibrate_INDEL_plots.R -tranchesFile recalibrate_INDEL.tranches \ java -jar GenomeAnalysisTK.jar \ -T ApplyRecalibration \ -R reference.fa \ -input recalibrated_snps_raw_indels.vcf \ -mode INDEL \ --ts_filter_level 99.0 \ -recalFile recalibrate_INDEL.recal \ -tranchesFile recalibrate_INDEL.tranches \ -o recalibrated_variants.vcf