From 1a1181ebf7f70ef34394844eedf510099f6cd96f Mon Sep 17 00:00:00 2001
From: feizhao <zhaofei920810@gmail.com>
Date: Thu, 8 Apr 2021 11:01:13 +0800
Subject: [PATCH] new: add candidate region indel information

---
 .gitignore                                   |  6 ++---
 README.md                                    |  5 +++-
 docs/manual_zh.md                            |  4 ++--
 setup.py                                     |  2 +-
 tritimap/Snakefile                           |  2 ++
 tritimap/__init__.py                         |  2 +-
 tritimap/rules/vcf_pretreat.smk              | 19 +++++++++++++++-
 tritimap/scripts/getuniqscaffold.sh          | 10 ++++----
 tritimap/scripts/getuniqscaffold_by_fasta.sh | 11 ++++-----
 tritimap/scripts/vcf2tab.sh                  | 24 +++++++++++++++++++-
 tritimap_env.yaml                            |  2 ++
 11 files changed, 65 insertions(+), 22 deletions(-)

diff --git a/.gitignore b/.gitignore
index 58261b0..30451d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
-.DS_Store 
-*/.DS_Store 
+.DS_Store
+*/.DS_Store
 
 .vscode/*
 !.vscode/settings.json
@@ -12,4 +12,4 @@
 .history/
 
 meta.yaml
-test_dir
\ No newline at end of file
+test_dir
diff --git a/README.md b/README.md
index cfcd56d..61d0209 100644
--- a/README.md
+++ b/README.md
@@ -95,7 +95,10 @@ STAR --runThreadN 30 --runMode genomeGenerate \
 #### Configuration files
 
 ```sh
-# generate configuration file in running directory
+# generate configuration file in current directory
+tritimap init
+
+#Or generate configuration file in running directory
 tritimap init -d /your/work/path
 ```
 
diff --git a/docs/manual_zh.md b/docs/manual_zh.md
index 6fd172a..bdf3f6c 100644
--- a/docs/manual_zh.md
+++ b/docs/manual_zh.md
@@ -16,7 +16,7 @@
 
 其中，Interval Mapping Module 和 Assembly Module 为命令行软件，输入为混池测序 DNA-Seq（ChIP-Seq/WGS）或 RNA-Seq 数据，可以一步生成包括性状关联区间、突变位点和新基因在内的多种结果。
 
-Web-based Annotation Module 为在线分析平台。Triti-Map 收集了小麦族各物种和六倍体小麦已有测序品种的基因组信息。统一进行了基因功能注释和转录因子结合位点预测，同时整合了各物种有代表性的表观修饰数据。该平台可以进行包括 SNP 注释与可视化展示、同源基因分析、小麦族共线性区间分析和新序列功能注释在内的多种分析，可以为小麦族基因克隆提供更加丰富的参考信息。
+Web-based Annotation Module 为[在线分析平台](http://bioinfo.sibs.ac.cn/tritimap/)。Triti-Map 收集了小麦族各物种和六倍体小麦已有测序品种的基因组信息。统一进行了基因功能注释和转录因子结合位点预测，同时整合了各物种有代表性的表观修饰数据。该平台可以进行包括 SNP 注释与可视化展示、同源基因分析、小麦族共线性区间分析和新序列功能注释在内的多种分析，可以为小麦族基因克隆提供更加丰富的参考信息。
 
 ### Triti-Map 分析流程图
 
@@ -533,7 +533,7 @@ seqid	hit_db	hit_id	hit_desc	hit_url	hsp_bit_score	hsp_align_len	hsp_identity	hs
 
 ## 在线注释模块
 
-Triti-Map 的上游分析结果可以配套[在线分析平台](https://github.com/fei0810/Triti-Map)进行后续分析和可视化展示。包括对 SNP 进行功能和表观修饰注释，对基因进行同源基因分析，对性状关联区间进行共线性分析，对特异性片段进行功能注释等。
+Triti-Map 的上游分析结果可以配套[在线分析平台](http://bioinfo.sibs.ac.cn/tritimap/)进行后续分析和可视化展示。包括对 SNP 进行功能和表观修饰注释，对基因进行同源基因分析，对性状关联区间进行共线性分析，对特异性片段进行功能注释等。
 
 **在线分析平台主要功能示意图**
 
diff --git a/setup.py b/setup.py
index eced6c4..b46dd12 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
 
 setuptools.setup(
     name="tritimap",
-    version="0.9.0",
+    version="0.9.2",
     author="Fei Zhao",
     author_email="zhaofei920810@gmail.com",
     url="https://github.com/fei0810/Triti-Map",
diff --git a/tritimap/Snakefile b/tritimap/Snakefile
index 8cab78b..a6500bf 100644
--- a/tritimap/Snakefile
+++ b/tritimap/Snakefile
@@ -185,6 +185,7 @@ if config['module'] == "only_mapping":
         input:
             join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_snpindex_input.txt"),
             join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_output.txt"),
+            join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_indelinfo.txt"),
             join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_region.txt"),
             join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_raw_region.txt"),
             join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_snpinfo.txt"),
@@ -209,6 +210,7 @@ elif config['module'] == "all":
         input:
             join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_snpindex_input.txt"),
             join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_output.txt"),
+            join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_indelinfo.txt"),
             join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_region.txt"),
             join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_raw_region.txt"),
             join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_snpinfo.txt"),
diff --git a/tritimap/__init__.py b/tritimap/__init__.py
index d636ec7..8531c7e 100644
--- a/tritimap/__init__.py
+++ b/tritimap/__init__.py
@@ -1,5 +1,5 @@
 import os
 
-__version__ = "0.9.0"
+__version__ = "0.9.2"
 
 root_dir = os.path.dirname(os.path.abspath(__file__))
diff --git a/tritimap/rules/vcf_pretreat.smk b/tritimap/rules/vcf_pretreat.smk
index 65fd134..8f97fba 100644
--- a/tritimap/rules/vcf_pretreat.smk
+++ b/tritimap/rules/vcf_pretreat.smk
@@ -9,13 +9,14 @@ rule vcf2tab:
         scriptdir = script_dir
     output:
         snpindex = join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_snpindex_input.txt"),
+        indel = temp(join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_indel.txt")),
         qtlseqr = join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_qtlseqr_input.txt")
     message: "\nTransform genotype filterd VCF file to tab format. Input file: {input}\n"
     log:
         join(dir_path+"/logs", "_".join(samples.bulk.drop_duplicates()) + "_vcf2tab.log")
     shell:"""
     set +e
-    bash {params.scriptdir}/vcf2tab.sh {input} {output.snpindex} {output.qtlseqr} {params.pool1} {params.pool2} {params.snpdepth} {params.genome} > {log} 2>&1
+    bash {params.scriptdir}/vcf2tab.sh {input} {output.snpindex} {output.qtlseqr} {params.pool1} {params.pool2} {params.snpdepth} {params.genome} {output.indel} > {log} 2>&1
     exitcode=$?
     if [ $exitcode -eq 1 ]
     then
@@ -23,4 +24,20 @@ rule vcf2tab:
     else
         exit 0
     fi
+    """
+
+rule getRegionIndel:
+    input:
+        indel = join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_indel.txt"),
+        region=join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_region.txt")
+    params:
+        dirpath = join(dir_path+"/06_regionout")
+    output:
+        join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_indelinfo.txt")
+    message: "\nGet indel information in candidate region. Input file: {input}\n"
+    log:
+        join(dir_path+"/logs", "_".join(samples.bulk.drop_duplicates()) + "_getregionindel.log")
+    shell:"""
+    cat {input.region} | grep -v 'start' | cut -f1,3,4 | sort -k1,1 -k2,2n > {params.dirpath}/temp.candidateregion.indel.bed
+    bedmap --skip-unmapped --delim '\t' --header --echo {input.indel} {params.dirpath}/temp.candidateregion.indel.bed | cat <(head -n1 {input.indel}) - > {output} && rm {params.dirpath}/temp.candidateregion.indel.bed
     """
\ No newline at end of file
diff --git a/tritimap/scripts/getuniqscaffold.sh b/tritimap/scripts/getuniqscaffold.sh
index 9b9dd69..af7890c 100755
--- a/tritimap/scripts/getuniqscaffold.sh
+++ b/tritimap/scripts/getuniqscaffold.sh
@@ -44,8 +44,8 @@ bioawk -v len=$length -c fastx '{if(length($seq)>=len){print ">"$name;print $seq
 
 # map to ref
 if [ $type == "dna" ]; then
-	bwa-mem2 mem -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam -
-	bwa-mem2 mem -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam -
+	bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam -
+	bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam -
 elif [ $type == "rna" ]; then
 	minimap2 -ax splice --split-prefix minimap_${id1} -I $mem $genome ${dir}/temp.${id1}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id1}.scaffolds.500.bam -
 	minimap2 -ax splice --split-prefix minimap_${id2} -I $mem $genome ${dir}/temp.${id2}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id2}.scaffolds.500.bam -
@@ -90,7 +90,7 @@ samtools faidx -r ${dir}/temp.${id1}2${id2}_samtools.region.txt $genome >${dir}/
 
 makeblastdb -in ${dir}/temp.${id1}2${id2}_samtools.region.fasta -dbtype nucl
 
-# map to database 
+# map to database
 blastn -db ${dir}/temp.${id1}2${id2}_samtools.region.fasta -query ${dir}/temp.${id1}_step2_candidate.fasta -out ${dir}/temp.${id1}_step3_candidate.outfmt6 -outfmt 6 -num_threads $thread
 cut -f1 ${dir}/temp.${id1}_step3_candidate.outfmt6 | sort -u >${dir}/temp.${id1}_step3_candidate.id
 seqkit grep -n -f ${dir}/temp.${id1}_step3_candidate.id ${dir}/temp.${id1}_step2_candidate.fasta >${dir}/temp.${id1}_step3_candidate.fasta
@@ -116,7 +116,6 @@ cat <(awk 'BEGIN{print "seqid\tchrom\tstart\tend\thit_length\thit_score"}') ${di
 samtools view -f 4 ${dir}/temp.${id1}.scaffolds.500.bam | awk -v name=${id1} -F "\t" '{print ">"$1":"length($10)":"name"\n"$10}' >${dir}/temp.${id1}_unmap.fasta
 samtools view -f 4 ${dir}/temp.${id2}.scaffolds.500.bam | awk -v name=${id2} -F "\t" '{print ">"$1":"length($10)":"name"\n"$10}' >${dir}/temp.${id2}_unmap.fasta
 
-
 makeblastdb -in ${dir}/temp.${id1}_unmap.fasta -dbtype nucl
 makeblastdb -in ${dir}/temp.${id2}_unmap.fasta -dbtype nucl
 
@@ -141,8 +140,7 @@ seqkit grep -n -v -f ${dir}/temp.${id2}_unmap_needfilter.id ${dir}/temp.${id2}_u
 cp ${dir}/temp.${id1}_step2_unmap.fasta $unmap1
 cp ${dir}/temp.${id2}_step2_unmap.fasta $unmap2
 
-if [ -d $dir/temp_output ];then
+if [ -d $dir/temp_output ]; then
 	rm -rf $dir/temp_output
 fi
 mkdir ${dir}/temp_output && mv ${dir}/temp.${id1}* ${dir}/temp.${id2}* ${dir}/temp_output
-
diff --git a/tritimap/scripts/getuniqscaffold_by_fasta.sh b/tritimap/scripts/getuniqscaffold_by_fasta.sh
index 28e6c5b..2037480 100755
--- a/tritimap/scripts/getuniqscaffold_by_fasta.sh
+++ b/tritimap/scripts/getuniqscaffold_by_fasta.sh
@@ -19,7 +19,6 @@
 #
 ###############################################################################
 
-
 input1=$(realpath $1)
 id1=$(basename $1 | sed 's/_denovo_scaffolds.fasta//')
 input2=$(realpath $2)
@@ -45,8 +44,8 @@ bioawk -v len=$length -c fastx '{if(length($seq)>=len){print ">"$name;print $seq
 
 # map to ref
 if [ $type == "dna" ]; then
-	bwa-mem2 mem -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam -
-	bwa-mem2 mem -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam -
+	bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam -
+	bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam -
 elif [ $type == "rna" ]; then
 	minimap2 -ax splice --split-prefix minimap_${id1} -I $mem $genome ${dir}/temp.${id1}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id1}.scaffolds.500.bam -
 	minimap2 -ax splice --split-prefix minimap_${id2} -I $mem $genome ${dir}/temp.${id2}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id2}.scaffolds.500.bam -
@@ -93,7 +92,7 @@ ln -s $database ${dir}/temp.${id1}2${id2}_samtools.region.fasta
 
 makeblastdb -in ${dir}/temp.${id1}2${id2}_samtools.region.fasta -dbtype nucl
 
-# map to database 
+# map to database
 blastn -db ${dir}/temp.${id1}2${id2}_samtools.region.fasta -query ${dir}/temp.${id1}_step2_candidate.fasta -out ${dir}/temp.${id1}_step3_candidate.outfmt6 -outfmt 6 -num_threads $thread
 cut -f1 ${dir}/temp.${id1}_step3_candidate.outfmt6 | sort -u >${dir}/temp.${id1}_step3_candidate.id
 seqkit grep -n -f ${dir}/temp.${id1}_step3_candidate.id ${dir}/temp.${id1}_step2_candidate.fasta >${dir}/temp.${id1}_step3_candidate.fasta
@@ -144,7 +143,7 @@ seqkit grep -n -v -f ${dir}/temp.${id2}_unmap_needfilter.id ${dir}/temp.${id2}_u
 cp ${dir}/temp.${id1}_step2_unmap.fasta $unmap1
 cp ${dir}/temp.${id2}_step2_unmap.fasta $unmap2
 
-if [ -d $dir/temp_output ];then
+if [ -d $dir/temp_output ]; then
 	rm -rf $dir/temp_output
 fi
-mkdir ${dir}/temp_output && mv ${dir}/temp.${id1}* ${dir}/temp.${id2}* ${dir}/temp_output
\ No newline at end of file
+mkdir ${dir}/temp_output && mv ${dir}/temp.${id1}* ${dir}/temp.${id2}* ${dir}/temp_output
diff --git a/tritimap/scripts/vcf2tab.sh b/tritimap/scripts/vcf2tab.sh
index d76bea6..f0a0847 100755
--- a/tritimap/scripts/vcf2tab.sh
+++ b/tritimap/scripts/vcf2tab.sh
@@ -28,6 +28,7 @@ pool1=$4
 pool2=$5
 genome=$7
 depth=$6
+output3=$(realpath $8)
 
 gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
 	SelectVariants \
@@ -43,6 +44,19 @@ gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
 	-select 'vc.getGenotype("'$pool1'").getDP() >= '$depth' && vc.getGenotype("'$pool2'").getDP() >= '$depth'' \
 	-O $dir/temp.${id}.realuse.vcf
 
+gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
+	SelectVariants \
+	-R $genome \
+	-V $input \
+	--exclude-filtered \
+	-select-type INDEL \
+	--max-nocall-number 0 \
+	-sn $pool1 \
+	-sn $pool2 \
+	-select 'DP >= '$depth'' \
+	-select 'vc.getGenotype("'$pool1'").getDP() >= '$depth' && vc.getGenotype("'$pool2'").getDP() >= '$depth'' \
+	-O $dir/temp.${id}.realuse.indel.vcf
+
 gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
 	VariantsToTable \
 	-V $dir/temp.${id}.realuse.vcf \
@@ -50,10 +64,18 @@ gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
 	-F CHROM -F POS -F REF -F ALT -GF AD -GF DP \
 	-O $dir/temp.${id}.realuse.gatk.table
 
+gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
+	VariantsToTable \
+	-V $dir/temp.${id}.realuse.indel.vcf \
+	-R $genome \
+	-F CHROM -F POS -F REF -F ALT -GF AD -GF DP \
+	-O $dir/temp.${id}.realuse.indel.gatk.table
+
 if [ -s $dir/temp.${id}.realuse.gatk.table ]; then
 	cat $dir/temp.${id}.realuse.gatk.table | awk '$4!~/,/' | tr ',' '\t' | awk -v bulk1=$pool1 -v bulk2=$pool2 'BEGIN{OFS="\t";print "#CHROM\tPOSm1\tPOS\tREF\tALT\t"bulk1"_ref\t"bulk1"_alt\t"bulk1"_depth\t"bulk1"_ratio\t"bulk2"_ref\t"bulk2"_alt\t"bulk2"_depth\t"bulk2"_ratio\tsnpindex"}NR>1{print $1,$2-1,$2,$3,$4,$5,$6,$7,$6/$7,$8,$9,$10,$9/$10,$6/$7-$9/$10}' >$output1
 	cat $output1 | awk -v bulk1=$pool1 -v bulk2=$pool2 'BEGIN{OFS="\t"; print "CHROM\tPOS\tAD_REF."bulk1"\tAD_ALT."bulk1"\tAD_REF."bulk2"\tAD_ALT."bulk2}NR>1{print $1,$3,$6,$7,$10,$11}' >$output2
-	if [ -d $dir/temp_output ];then
+	cat $dir/temp.${id}.realuse.indel.gatk.table | awk '$4!~/,/' | tr ',' '\t' | awk -v bulk1=$pool1 -v bulk2=$pool2 'BEGIN{OFS="\t";print "#CHROM\tPOSm1\tPOS\tREF\tALT\t"bulk1"_ref\t"bulk1"_alt\t"bulk1"_depth\t"bulk1"_ratio\t"bulk2"_ref\t"bulk2"_alt\t"bulk2"_depth\t"bulk2"_ratio\tsnpindex"}NR>1{print $1,$2-1,$2,$3,$4,$5,$6,$7,$6/$7,$8,$9,$10,$9/$10,$6/$7-$9/$10}' >$output3
+	if [ -d $dir/temp_output ]; then
 		rm -rf $dir/temp_output
 	fi
 	mkdir $dir/temp_output && mv $dir/temp.${id}* $dir/temp_output
diff --git a/tritimap_env.yaml b/tritimap_env.yaml
index f3cd5ec..ae5ff08 100644
--- a/tritimap_env.yaml
+++ b/tritimap_env.yaml
@@ -12,6 +12,8 @@ dependencies:
   - biopython
   - fastp>=0.20.1
   - bioawk
+  - blast >=2.5.0
+  - bedops >=2.4.39
   - bwa-mem2=2.2.1
   - samtools=1.12
   - seqkit>=0.15.0