From 1a1181ebf7f70ef34394844eedf510099f6cd96f Mon Sep 17 00:00:00 2001 From: feizhao Date: Thu, 8 Apr 2021 11:01:13 +0800 Subject: [PATCH] new: add candidate region indel information --- .gitignore | 6 ++--- README.md | 5 +++- docs/manual_zh.md | 4 ++-- setup.py | 2 +- tritimap/Snakefile | 2 ++ tritimap/__init__.py | 2 +- tritimap/rules/vcf_pretreat.smk | 19 +++++++++++++++- tritimap/scripts/getuniqscaffold.sh | 10 ++++---- tritimap/scripts/getuniqscaffold_by_fasta.sh | 11 ++++----- tritimap/scripts/vcf2tab.sh | 24 +++++++++++++++++++- tritimap_env.yaml | 2 ++ 11 files changed, 65 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index 58261b0..30451d1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ -.DS_Store -*/.DS_Store +.DS_Store +*/.DS_Store .vscode/* !.vscode/settings.json @@ -12,4 +12,4 @@ .history/ meta.yaml -test_dir \ No newline at end of file +test_dir diff --git a/README.md b/README.md index cfcd56d..61d0209 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,10 @@ STAR --runThreadN 30 --runMode genomeGenerate \ #### Configuration files ```sh -# generate configuration file in running directory +# generate configuration file in current directory +tritimap init + +#Or generate configuration file in running directory tritimap init -d /your/work/path ``` diff --git a/docs/manual_zh.md b/docs/manual_zh.md index 6fd172a..bdf3f6c 100644 --- a/docs/manual_zh.md +++ b/docs/manual_zh.md @@ -16,7 +16,7 @@ 其中,Interval Mapping Module 和 Assembly Module 为命令行软件,输入为混池测序 DNA-Seq(ChIP-Seq/WGS)或 RNA-Seq 数据,可以一步生成包括性状关联区间、突变位点和新基因在内的多种结果。 -Web-based Annotation Module 为在线分析平台。Triti-Map 收集了小麦族各物种和六倍体小麦已有测序品种的基因组信息。统一进行了基因功能注释和转录因子结合位点预测,同时整合了各物种有代表性的表观修饰数据。该平台可以进行包括 SNP 注释与可视化展示、同源基因分析、小麦族共线性区间分析和新序列功能注释在内的多种分析,可以为小麦族基因克隆提供更加丰富的参考信息。 +Web-based Annotation Module 为[在线分析平台](http://bioinfo.sibs.ac.cn/tritimap/)。Triti-Map 收集了小麦族各物种和六倍体小麦已有测序品种的基因组信息。统一进行了基因功能注释和转录因子结合位点预测,同时整合了各物种有代表性的表观修饰数据。该平台可以进行包括 SNP 注释与可视化展示、同源基因分析、小麦族共线性区间分析和新序列功能注释在内的多种分析,可以为小麦族基因克隆提供更加丰富的参考信息。 ### Triti-Map 分析流程图 @@ -533,7 +533,7 @@ seqid hit_db hit_id hit_desc hit_url hsp_bit_score hsp_align_len hsp_identity hs ## 在线注释模块 -Triti-Map 的上游分析结果可以配套[在线分析平台](https://github.com/fei0810/Triti-Map)进行后续分析和可视化展示。包括对 SNP 进行功能和表观修饰注释,对基因进行同源基因分析,对性状关联区间进行共线性分析,对特异性片段进行功能注释等。 +Triti-Map 的上游分析结果可以配套[在线分析平台](http://bioinfo.sibs.ac.cn/tritimap/)进行后续分析和可视化展示。包括对 SNP 进行功能和表观修饰注释,对基因进行同源基因分析,对性状关联区间进行共线性分析,对特异性片段进行功能注释等。 **在线分析平台主要功能示意图** diff --git a/setup.py b/setup.py index eced6c4..b46dd12 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setuptools.setup( name="tritimap", - version="0.9.0", + version="0.9.2", author="Fei Zhao", author_email="zhaofei920810@gmail.com", url="https://github.com/fei0810/Triti-Map", diff --git a/tritimap/Snakefile b/tritimap/Snakefile index 8cab78b..a6500bf 100644 --- a/tritimap/Snakefile +++ b/tritimap/Snakefile @@ -185,6 +185,7 @@ if config['module'] == "only_mapping": input: join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_snpindex_input.txt"), join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_output.txt"), + join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_indelinfo.txt"), join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_region.txt"), join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_raw_region.txt"), join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_snpinfo.txt"), @@ -209,6 +210,7 @@ elif config['module'] == "all": input: join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_snpindex_input.txt"), join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_output.txt"), + join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_indelinfo.txt"), join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_region.txt"), join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_raw_region.txt"), join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_snpinfo.txt"), diff --git a/tritimap/__init__.py b/tritimap/__init__.py index d636ec7..8531c7e 100644 --- a/tritimap/__init__.py +++ b/tritimap/__init__.py @@ -1,5 +1,5 @@ import os -__version__ = "0.9.0" +__version__ = "0.9.2" root_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/tritimap/rules/vcf_pretreat.smk b/tritimap/rules/vcf_pretreat.smk index 65fd134..8f97fba 100644 --- a/tritimap/rules/vcf_pretreat.smk +++ b/tritimap/rules/vcf_pretreat.smk @@ -9,13 +9,14 @@ rule vcf2tab: scriptdir = script_dir output: snpindex = join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_snpindex_input.txt"), + indel = temp(join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_indel.txt")), qtlseqr = join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_qtlseqr_input.txt") message: "\nTransform genotype filterd VCF file to tab format. Input file: {input}\n" log: join(dir_path+"/logs", "_".join(samples.bulk.drop_duplicates()) + "_vcf2tab.log") shell:""" set +e - bash {params.scriptdir}/vcf2tab.sh {input} {output.snpindex} {output.qtlseqr} {params.pool1} {params.pool2} {params.snpdepth} {params.genome} > {log} 2>&1 + bash {params.scriptdir}/vcf2tab.sh {input} {output.snpindex} {output.qtlseqr} {params.pool1} {params.pool2} {params.snpdepth} {params.genome} {output.indel} > {log} 2>&1 exitcode=$? if [ $exitcode -eq 1 ] then @@ -23,4 +24,20 @@ rule vcf2tab: else exit 0 fi + """ + +rule getRegionIndel: + input: + indel = join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_indel.txt"), + region=join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_region.txt") + params: + dirpath = join(dir_path+"/06_regionout") + output: + join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_indelinfo.txt") + message: "\nGet indel information in candidate region. Input file: {input}\n" + log: + join(dir_path+"/logs", "_".join(samples.bulk.drop_duplicates()) + "_getregionindel.log") + shell:""" + cat {input.region} | grep -v 'start' | cut -f1,3,4 | sort -k1,1 -k2,2n > {params.dirpath}/temp.candidateregion.indel.bed + bedmap --skip-unmapped --delim '\t' --header --echo {input.indel} {params.dirpath}/temp.candidateregion.indel.bed | cat <(head -n1 {input.indel}) - > {output} && rm {params.dirpath}/temp.candidateregion.indel.bed """ \ No newline at end of file diff --git a/tritimap/scripts/getuniqscaffold.sh b/tritimap/scripts/getuniqscaffold.sh index 9b9dd69..af7890c 100755 --- a/tritimap/scripts/getuniqscaffold.sh +++ b/tritimap/scripts/getuniqscaffold.sh @@ -44,8 +44,8 @@ bioawk -v len=$length -c fastx '{if(length($seq)>=len){print ">"$name;print $seq # map to ref if [ $type == "dna" ]; then - bwa-mem2 mem -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam - - bwa-mem2 mem -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam - + bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam - + bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam - elif [ $type == "rna" ]; then minimap2 -ax splice --split-prefix minimap_${id1} -I $mem $genome ${dir}/temp.${id1}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id1}.scaffolds.500.bam - minimap2 -ax splice --split-prefix minimap_${id2} -I $mem $genome ${dir}/temp.${id2}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id2}.scaffolds.500.bam - @@ -90,7 +90,7 @@ samtools faidx -r ${dir}/temp.${id1}2${id2}_samtools.region.txt $genome >${dir}/ makeblastdb -in ${dir}/temp.${id1}2${id2}_samtools.region.fasta -dbtype nucl -# map to database +# map to database blastn -db ${dir}/temp.${id1}2${id2}_samtools.region.fasta -query ${dir}/temp.${id1}_step2_candidate.fasta -out ${dir}/temp.${id1}_step3_candidate.outfmt6 -outfmt 6 -num_threads $thread cut -f1 ${dir}/temp.${id1}_step3_candidate.outfmt6 | sort -u >${dir}/temp.${id1}_step3_candidate.id seqkit grep -n -f ${dir}/temp.${id1}_step3_candidate.id ${dir}/temp.${id1}_step2_candidate.fasta >${dir}/temp.${id1}_step3_candidate.fasta @@ -116,7 +116,6 @@ cat <(awk 'BEGIN{print "seqid\tchrom\tstart\tend\thit_length\thit_score"}') ${di samtools view -f 4 ${dir}/temp.${id1}.scaffolds.500.bam | awk -v name=${id1} -F "\t" '{print ">"$1":"length($10)":"name"\n"$10}' >${dir}/temp.${id1}_unmap.fasta samtools view -f 4 ${dir}/temp.${id2}.scaffolds.500.bam | awk -v name=${id2} -F "\t" '{print ">"$1":"length($10)":"name"\n"$10}' >${dir}/temp.${id2}_unmap.fasta - makeblastdb -in ${dir}/temp.${id1}_unmap.fasta -dbtype nucl makeblastdb -in ${dir}/temp.${id2}_unmap.fasta -dbtype nucl @@ -141,8 +140,7 @@ seqkit grep -n -v -f ${dir}/temp.${id2}_unmap_needfilter.id ${dir}/temp.${id2}_u cp ${dir}/temp.${id1}_step2_unmap.fasta $unmap1 cp ${dir}/temp.${id2}_step2_unmap.fasta $unmap2 -if [ -d $dir/temp_output ];then +if [ -d $dir/temp_output ]; then rm -rf $dir/temp_output fi mkdir ${dir}/temp_output && mv ${dir}/temp.${id1}* ${dir}/temp.${id2}* ${dir}/temp_output - diff --git a/tritimap/scripts/getuniqscaffold_by_fasta.sh b/tritimap/scripts/getuniqscaffold_by_fasta.sh index 28e6c5b..2037480 100755 --- a/tritimap/scripts/getuniqscaffold_by_fasta.sh +++ b/tritimap/scripts/getuniqscaffold_by_fasta.sh @@ -19,7 +19,6 @@ # ############################################################################### - input1=$(realpath $1) id1=$(basename $1 | sed 's/_denovo_scaffolds.fasta//') input2=$(realpath $2) @@ -45,8 +44,8 @@ bioawk -v len=$length -c fastx '{if(length($seq)>=len){print ">"$name;print $seq # map to ref if [ $type == "dna" ]; then - bwa-mem2 mem -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam - - bwa-mem2 mem -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam - + bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam - + bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam - elif [ $type == "rna" ]; then minimap2 -ax splice --split-prefix minimap_${id1} -I $mem $genome ${dir}/temp.${id1}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id1}.scaffolds.500.bam - minimap2 -ax splice --split-prefix minimap_${id2} -I $mem $genome ${dir}/temp.${id2}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id2}.scaffolds.500.bam - @@ -93,7 +92,7 @@ ln -s $database ${dir}/temp.${id1}2${id2}_samtools.region.fasta makeblastdb -in ${dir}/temp.${id1}2${id2}_samtools.region.fasta -dbtype nucl -# map to database +# map to database blastn -db ${dir}/temp.${id1}2${id2}_samtools.region.fasta -query ${dir}/temp.${id1}_step2_candidate.fasta -out ${dir}/temp.${id1}_step3_candidate.outfmt6 -outfmt 6 -num_threads $thread cut -f1 ${dir}/temp.${id1}_step3_candidate.outfmt6 | sort -u >${dir}/temp.${id1}_step3_candidate.id seqkit grep -n -f ${dir}/temp.${id1}_step3_candidate.id ${dir}/temp.${id1}_step2_candidate.fasta >${dir}/temp.${id1}_step3_candidate.fasta @@ -144,7 +143,7 @@ seqkit grep -n -v -f ${dir}/temp.${id2}_unmap_needfilter.id ${dir}/temp.${id2}_u cp ${dir}/temp.${id1}_step2_unmap.fasta $unmap1 cp ${dir}/temp.${id2}_step2_unmap.fasta $unmap2 -if [ -d $dir/temp_output ];then +if [ -d $dir/temp_output ]; then rm -rf $dir/temp_output fi -mkdir ${dir}/temp_output && mv ${dir}/temp.${id1}* ${dir}/temp.${id2}* ${dir}/temp_output \ No newline at end of file +mkdir ${dir}/temp_output && mv ${dir}/temp.${id1}* ${dir}/temp.${id2}* ${dir}/temp_output diff --git a/tritimap/scripts/vcf2tab.sh b/tritimap/scripts/vcf2tab.sh index d76bea6..f0a0847 100755 --- a/tritimap/scripts/vcf2tab.sh +++ b/tritimap/scripts/vcf2tab.sh @@ -28,6 +28,7 @@ pool1=$4 pool2=$5 genome=$7 depth=$6 +output3=$(realpath $8) gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \ SelectVariants \ @@ -43,6 +44,19 @@ gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \ -select 'vc.getGenotype("'$pool1'").getDP() >= '$depth' && vc.getGenotype("'$pool2'").getDP() >= '$depth'' \ -O $dir/temp.${id}.realuse.vcf +gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \ + SelectVariants \ + -R $genome \ + -V $input \ + --exclude-filtered \ + -select-type INDEL \ + --max-nocall-number 0 \ + -sn $pool1 \ + -sn $pool2 \ + -select 'DP >= '$depth'' \ + -select 'vc.getGenotype("'$pool1'").getDP() >= '$depth' && vc.getGenotype("'$pool2'").getDP() >= '$depth'' \ + -O $dir/temp.${id}.realuse.indel.vcf + gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \ VariantsToTable \ -V $dir/temp.${id}.realuse.vcf \ @@ -50,10 +64,18 @@ gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \ -F CHROM -F POS -F REF -F ALT -GF AD -GF DP \ -O $dir/temp.${id}.realuse.gatk.table +gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \ + VariantsToTable \ + -V $dir/temp.${id}.realuse.indel.vcf \ + -R $genome \ + -F CHROM -F POS -F REF -F ALT -GF AD -GF DP \ + -O $dir/temp.${id}.realuse.indel.gatk.table + if [ -s $dir/temp.${id}.realuse.gatk.table ]; then cat $dir/temp.${id}.realuse.gatk.table | awk '$4!~/,/' | tr ',' '\t' | awk -v bulk1=$pool1 -v bulk2=$pool2 'BEGIN{OFS="\t";print "#CHROM\tPOSm1\tPOS\tREF\tALT\t"bulk1"_ref\t"bulk1"_alt\t"bulk1"_depth\t"bulk1"_ratio\t"bulk2"_ref\t"bulk2"_alt\t"bulk2"_depth\t"bulk2"_ratio\tsnpindex"}NR>1{print $1,$2-1,$2,$3,$4,$5,$6,$7,$6/$7,$8,$9,$10,$9/$10,$6/$7-$9/$10}' >$output1 cat $output1 | awk -v bulk1=$pool1 -v bulk2=$pool2 'BEGIN{OFS="\t"; print "CHROM\tPOS\tAD_REF."bulk1"\tAD_ALT."bulk1"\tAD_REF."bulk2"\tAD_ALT."bulk2}NR>1{print $1,$3,$6,$7,$10,$11}' >$output2 - if [ -d $dir/temp_output ];then + cat $dir/temp.${id}.realuse.indel.gatk.table | awk '$4!~/,/' | tr ',' '\t' | awk -v bulk1=$pool1 -v bulk2=$pool2 'BEGIN{OFS="\t";print "#CHROM\tPOSm1\tPOS\tREF\tALT\t"bulk1"_ref\t"bulk1"_alt\t"bulk1"_depth\t"bulk1"_ratio\t"bulk2"_ref\t"bulk2"_alt\t"bulk2"_depth\t"bulk2"_ratio\tsnpindex"}NR>1{print $1,$2-1,$2,$3,$4,$5,$6,$7,$6/$7,$8,$9,$10,$9/$10,$6/$7-$9/$10}' >$output3 + if [ -d $dir/temp_output ]; then rm -rf $dir/temp_output fi mkdir $dir/temp_output && mv $dir/temp.${id}* $dir/temp_output diff --git a/tritimap_env.yaml b/tritimap_env.yaml index f3cd5ec..ae5ff08 100644 --- a/tritimap_env.yaml +++ b/tritimap_env.yaml @@ -12,6 +12,8 @@ dependencies: - biopython - fastp>=0.20.1 - bioawk + - blast >=2.5.0 + - bedops >=2.4.39 - bwa-mem2=2.2.1 - samtools=1.12 - seqkit>=0.15.0