new: add candidate region indel information

fei0810 · Apr 8, 2021 · 1a1181e · 1a1181e
1 parent b718b01
commit 1a1181e
Show file tree

Hide file tree

Showing 11 changed files with 65 additions and 22 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,5 @@
-.DS_Store 
-*/.DS_Store 
+.DS_Store
+*/.DS_Store
 
 .vscode/*
 !.vscode/settings.json
@@ -12,4 +12,4 @@
 .history/
 
 meta.yaml
-test_dir
+test_dir
diff --git a/README.md b/README.md
@@ -95,7 +95,10 @@ STAR --runThreadN 30 --runMode genomeGenerate \
 #### Configuration files
 
 ```sh
-# generate configuration file in running directory
+# generate configuration file in current directory
+tritimap init
+
+#Or generate configuration file in running directory
 tritimap init -d /your/work/path
 ```
 

diff --git a/docs/manual_zh.md b/docs/manual_zh.md
@@ -16,7 +16,7 @@
 
 其中，Interval Mapping Module 和 Assembly Module 为命令行软件，输入为混池测序 DNA-Seq（ChIP-Seq/WGS）或 RNA-Seq 数据，可以一步生成包括性状关联区间、突变位点和新基因在内的多种结果。
 
-Web-based Annotation Module 为在线分析平台。Triti-Map 收集了小麦族各物种和六倍体小麦已有测序品种的基因组信息。统一进行了基因功能注释和转录因子结合位点预测，同时整合了各物种有代表性的表观修饰数据。该平台可以进行包括 SNP 注释与可视化展示、同源基因分析、小麦族共线性区间分析和新序列功能注释在内的多种分析，可以为小麦族基因克隆提供更加丰富的参考信息。
+Web-based Annotation Module 为[在线分析平台](http://bioinfo.sibs.ac.cn/tritimap/)。Triti-Map 收集了小麦族各物种和六倍体小麦已有测序品种的基因组信息。统一进行了基因功能注释和转录因子结合位点预测，同时整合了各物种有代表性的表观修饰数据。该平台可以进行包括 SNP 注释与可视化展示、同源基因分析、小麦族共线性区间分析和新序列功能注释在内的多种分析，可以为小麦族基因克隆提供更加丰富的参考信息。
 
 ### Triti-Map 分析流程图
 
@@ -533,7 +533,7 @@ seqid hit_db hit_id hit_desc hit_url hsp_bit_score hsp_align_len hsp_identity hs
 
 ## 在线注释模块
 
-Triti-Map 的上游分析结果可以配套[在线分析平台](https://github.com/fei0810/Triti-Map)进行后续分析和可视化展示。包括对 SNP 进行功能和表观修饰注释，对基因进行同源基因分析，对性状关联区间进行共线性分析，对特异性片段进行功能注释等。
+Triti-Map 的上游分析结果可以配套[在线分析平台](http://bioinfo.sibs.ac.cn/tritimap/)进行后续分析和可视化展示。包括对 SNP 进行功能和表观修饰注释，对基因进行同源基因分析，对性状关联区间进行共线性分析，对特异性片段进行功能注释等。
 
 **在线分析平台主要功能示意图**
 

diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
 
 setuptools.setup(
  name="tritimap",
- version="0.9.0",
+ version="0.9.2",
  author="Fei Zhao",
  author_email="zhaofei920810@gmail.com",
  url="https://github.com/fei0810/Triti-Map",

diff --git a/tritimap/Snakefile b/tritimap/Snakefile
@@ -185,6 +185,7 @@ if config['module'] == "only_mapping":
  input:
  join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_snpindex_input.txt"),
  join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_output.txt"),
+ join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_indelinfo.txt"),
  join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_region.txt"),
  join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_raw_region.txt"),
  join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_snpinfo.txt"),
@@ -209,6 +210,7 @@ elif config['module'] == "all":
  input:
  join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_snpindex_input.txt"),
  join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_output.txt"),
+ join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_indelinfo.txt"),
  join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_region.txt"),
  join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_raw_region.txt"),
  join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_snpinfo.txt"),

diff --git a/tritimap/__init__.py b/tritimap/__init__.py
@@ -1,5 +1,5 @@
 import os
 
-__version__ = "0.9.0"
+__version__ = "0.9.2"
 
 root_dir = os.path.dirname(os.path.abspath(__file__))
diff --git a/tritimap/rules/vcf_pretreat.smk b/tritimap/rules/vcf_pretreat.smk
@@ -9,18 +9,35 @@ rule vcf2tab:
  scriptdir = script_dir
  output:
  snpindex = join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_snpindex_input.txt"),
+ indel = temp(join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_indel.txt")),
  qtlseqr = join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_qtlseqr_input.txt")
  message: "\nTransform genotype filterd VCF file to tab format. Input file: {input}\n"
  log:
  join(dir_path+"/logs", "_".join(samples.bulk.drop_duplicates()) + "_vcf2tab.log")
  shell:"""
  set +e
- bash {params.scriptdir}/vcf2tab.sh {input} {output.snpindex} {output.qtlseqr} {params.pool1} {params.pool2} {params.snpdepth} {params.genome} > {log} 2>&1
+ bash {params.scriptdir}/vcf2tab.sh {input} {output.snpindex} {output.qtlseqr} {params.pool1} {params.pool2} {params.snpdepth} {params.genome} {output.indel} > {log} 2>&1
  exitcode=$?
  if [ $exitcode -eq 1 ]
  then
  exit 1
  else
  exit 0
  fi
+ """
+
+rule getRegionIndel:
+ input:
+ indel = join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_indel.txt"),
+ region=join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_region.txt")
+ params:
+ dirpath = join(dir_path+"/06_regionout")
+ output:
+ join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_indelinfo.txt")
+ message: "\nGet indel information in candidate region. Input file: {input}\n"
+ log:
+ join(dir_path+"/logs", "_".join(samples.bulk.drop_duplicates()) + "_getregionindel.log")
+ shell:"""
+ cat {input.region} | grep -v 'start' | cut -f1,3,4 | sort -k1,1 -k2,2n > {params.dirpath}/temp.candidateregion.indel.bed
+ bedmap --skip-unmapped --delim '\t' --header --echo {input.indel} {params.dirpath}/temp.candidateregion.indel.bed | cat <(head -n1 {input.indel}) - > {output} && rm {params.dirpath}/temp.candidateregion.indel.bed
  """
diff --git a/tritimap/scripts/getuniqscaffold.sh b/tritimap/scripts/getuniqscaffold.sh
@@ -44,8 +44,8 @@ bioawk -v len=$length -c fastx '{if(length($seq)>=len){print ">"$name;print $seq
 
 # map to ref
 if [ $type == "dna" ]; then
- bwa-mem2 mem -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam -
- bwa-mem2 mem -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam -
+ bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam -
+ bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam -
 elif [ $type == "rna" ]; then
  minimap2 -ax splice --split-prefix minimap_${id1} -I $mem $genome ${dir}/temp.${id1}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id1}.scaffolds.500.bam -
  minimap2 -ax splice --split-prefix minimap_${id2} -I $mem $genome ${dir}/temp.${id2}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id2}.scaffolds.500.bam -
@@ -90,7 +90,7 @@ samtools faidx -r ${dir}/temp.${id1}2${id2}_samtools.region.txt $genome >${dir}/
 
 makeblastdb -in ${dir}/temp.${id1}2${id2}_samtools.region.fasta -dbtype nucl
 
-# map to database 
+# map to database
 blastn -db ${dir}/temp.${id1}2${id2}_samtools.region.fasta -query ${dir}/temp.${id1}_step2_candidate.fasta -out ${dir}/temp.${id1}_step3_candidate.outfmt6 -outfmt 6 -num_threads $thread
 cut -f1 ${dir}/temp.${id1}_step3_candidate.outfmt6 | sort -u >${dir}/temp.${id1}_step3_candidate.id
 seqkit grep -n -f ${dir}/temp.${id1}_step3_candidate.id ${dir}/temp.${id1}_step2_candidate.fasta >${dir}/temp.${id1}_step3_candidate.fasta
@@ -116,7 +116,6 @@ cat <(awk 'BEGIN{print "seqid\tchrom\tstart\tend\thit_length\thit_score"}') ${di
 samtools view -f 4 ${dir}/temp.${id1}.scaffolds.500.bam | awk -v name=${id1} -F "\t" '{print ">"$1":"length($10)":"name"\n"$10}' >${dir}/temp.${id1}_unmap.fasta
 samtools view -f 4 ${dir}/temp.${id2}.scaffolds.500.bam | awk -v name=${id2} -F "\t" '{print ">"$1":"length($10)":"name"\n"$10}' >${dir}/temp.${id2}_unmap.fasta
 
-
 makeblastdb -in ${dir}/temp.${id1}_unmap.fasta -dbtype nucl
 makeblastdb -in ${dir}/temp.${id2}_unmap.fasta -dbtype nucl
 
@@ -141,8 +140,7 @@ seqkit grep -n -v -f ${dir}/temp.${id2}_unmap_needfilter.id ${dir}/temp.${id2}_u
 cp ${dir}/temp.${id1}_step2_unmap.fasta $unmap1
 cp ${dir}/temp.${id2}_step2_unmap.fasta $unmap2
 
-if [ -d $dir/temp_output ];then
+if [ -d $dir/temp_output ]; then
  rm -rf $dir/temp_output
 fi
 mkdir ${dir}/temp_output && mv ${dir}/temp.${id1}* ${dir}/temp.${id2}* ${dir}/temp_output
-
diff --git a/tritimap/scripts/getuniqscaffold_by_fasta.sh b/tritimap/scripts/getuniqscaffold_by_fasta.sh
@@ -19,7 +19,6 @@
 #
 ###############################################################################
 
-
 input1=$(realpath $1)
 id1=$(basename $1 | sed 's/_denovo_scaffolds.fasta//')
 input2=$(realpath $2)
@@ -45,8 +44,8 @@ bioawk -v len=$length -c fastx '{if(length($seq)>=len){print ">"$name;print $seq
 
 # map to ref
 if [ $type == "dna" ]; then
- bwa-mem2 mem -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam -
- bwa-mem2 mem -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam -
+ bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam -
+ bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam -
 elif [ $type == "rna" ]; then
  minimap2 -ax splice --split-prefix minimap_${id1} -I $mem $genome ${dir}/temp.${id1}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id1}.scaffolds.500.bam -
  minimap2 -ax splice --split-prefix minimap_${id2} -I $mem $genome ${dir}/temp.${id2}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id2}.scaffolds.500.bam -
@@ -93,7 +92,7 @@ ln -s $database ${dir}/temp.${id1}2${id2}_samtools.region.fasta
 
 makeblastdb -in ${dir}/temp.${id1}2${id2}_samtools.region.fasta -dbtype nucl
 
-# map to database 
+# map to database
 blastn -db ${dir}/temp.${id1}2${id2}_samtools.region.fasta -query ${dir}/temp.${id1}_step2_candidate.fasta -out ${dir}/temp.${id1}_step3_candidate.outfmt6 -outfmt 6 -num_threads $thread
 cut -f1 ${dir}/temp.${id1}_step3_candidate.outfmt6 | sort -u >${dir}/temp.${id1}_step3_candidate.id
 seqkit grep -n -f ${dir}/temp.${id1}_step3_candidate.id ${dir}/temp.${id1}_step2_candidate.fasta >${dir}/temp.${id1}_step3_candidate.fasta
@@ -144,7 +143,7 @@ seqkit grep -n -v -f ${dir}/temp.${id2}_unmap_needfilter.id ${dir}/temp.${id2}_u
 cp ${dir}/temp.${id1}_step2_unmap.fasta $unmap1
 cp ${dir}/temp.${id2}_step2_unmap.fasta $unmap2
 
-if [ -d $dir/temp_output ];then
+if [ -d $dir/temp_output ]; then
  rm -rf $dir/temp_output
 fi
-mkdir ${dir}/temp_output && mv ${dir}/temp.${id1}* ${dir}/temp.${id2}* ${dir}/temp_output
+mkdir ${dir}/temp_output && mv ${dir}/temp.${id1}* ${dir}/temp.${id2}* ${dir}/temp_output
diff --git a/tritimap/scripts/vcf2tab.sh b/tritimap/scripts/vcf2tab.sh
@@ -28,6 +28,7 @@ pool1=$4
 pool2=$5
 genome=$7
 depth=$6
+output3=$(realpath $8)
 
 gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
  SelectVariants \
@@ -43,17 +44,38 @@ gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
  -select 'vc.getGenotype("'$pool1'").getDP() >= '$depth' && vc.getGenotype("'$pool2'").getDP() >= '$depth'' \
  -O $dir/temp.${id}.realuse.vcf
 
+gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
+ SelectVariants \
+ -R $genome \
+ -V $input \
+ --exclude-filtered \
+ -select-type INDEL \
+ --max-nocall-number 0 \
+ -sn $pool1 \
+ -sn $pool2 \
+ -select 'DP >= '$depth'' \
+ -select 'vc.getGenotype("'$pool1'").getDP() >= '$depth' && vc.getGenotype("'$pool2'").getDP() >= '$depth'' \
+ -O $dir/temp.${id}.realuse.indel.vcf
+
 gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
  VariantsToTable \
  -V $dir/temp.${id}.realuse.vcf \
  -R $genome \
  -F CHROM -F POS -F REF -F ALT -GF AD -GF DP \
  -O $dir/temp.${id}.realuse.gatk.table
 
+gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
+ VariantsToTable \
+ -V $dir/temp.${id}.realuse.indel.vcf \
+ -R $genome \
+ -F CHROM -F POS -F REF -F ALT -GF AD -GF DP \
+ -O $dir/temp.${id}.realuse.indel.gatk.table
+
 if [ -s $dir/temp.${id}.realuse.gatk.table ]; then
  cat $dir/temp.${id}.realuse.gatk.table | awk '$4!~/,/' | tr ',' '\t' | awk -v bulk1=$pool1 -v bulk2=$pool2 'BEGIN{OFS="\t";print "#CHROM\tPOSm1\tPOS\tREF\tALT\t"bulk1"_ref\t"bulk1"_alt\t"bulk1"_depth\t"bulk1"_ratio\t"bulk2"_ref\t"bulk2"_alt\t"bulk2"_depth\t"bulk2"_ratio\tsnpindex"}NR>1{print $1,$2-1,$2,$3,$4,$5,$6,$7,$6/$7,$8,$9,$10,$9/$10,$6/$7-$9/$10}' >$output1
  cat $output1 | awk -v bulk1=$pool1 -v bulk2=$pool2 'BEGIN{OFS="\t"; print "CHROM\tPOS\tAD_REF."bulk1"\tAD_ALT."bulk1"\tAD_REF."bulk2"\tAD_ALT."bulk2}NR>1{print $1,$3,$6,$7,$10,$11}' >$output2
- if [ -d $dir/temp_output ];then
+ cat $dir/temp.${id}.realuse.indel.gatk.table | awk '$4!~/,/' | tr ',' '\t' | awk -v bulk1=$pool1 -v bulk2=$pool2 'BEGIN{OFS="\t";print "#CHROM\tPOSm1\tPOS\tREF\tALT\t"bulk1"_ref\t"bulk1"_alt\t"bulk1"_depth\t"bulk1"_ratio\t"bulk2"_ref\t"bulk2"_alt\t"bulk2"_depth\t"bulk2"_ratio\tsnpindex"}NR>1{print $1,$2-1,$2,$3,$4,$5,$6,$7,$6/$7,$8,$9,$10,$9/$10,$6/$7-$9/$10}' >$output3
+ if [ -d $dir/temp_output ]; then
  rm -rf $dir/temp_output
  fi
  mkdir $dir/temp_output && mv $dir/temp.${id}* $dir/temp_output

diff --git a/tritimap_env.yaml b/tritimap_env.yaml
@@ -12,6 +12,8 @@ dependencies:
  - biopython
  - fastp>=0.20.1
  - bioawk
+ - blast >=2.5.0
+ - bedops >=2.4.39
  - bwa-mem2=2.2.1
  - samtools=1.12
  - seqkit>=0.15.0