Skip to content

Commit

Permalink
new: add candidate region indel information
Browse files Browse the repository at this point in the history
  • Loading branch information
fei0810 committed Apr 8, 2021
1 parent b718b01 commit 1a1181e
Show file tree
Hide file tree
Showing 11 changed files with 65 additions and 22 deletions.
6 changes: 3 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
.DS_Store
*/.DS_Store
.DS_Store
*/.DS_Store

.vscode/*
!.vscode/settings.json
Expand All @@ -12,4 +12,4 @@
.history/

meta.yaml
test_dir
test_dir
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,10 @@ STAR --runThreadN 30 --runMode genomeGenerate \
#### Configuration files

```sh
# generate configuration file in running directory
# generate configuration file in current directory
tritimap init

#Or generate configuration file in running directory
tritimap init -d /your/work/path
```

Expand Down
4 changes: 2 additions & 2 deletions docs/manual_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

其中,Interval Mapping Module 和 Assembly Module 为命令行软件,输入为混池测序 DNA-Seq(ChIP-Seq/WGS)或 RNA-Seq 数据,可以一步生成包括性状关联区间、突变位点和新基因在内的多种结果。

Web-based Annotation Module 为在线分析平台。Triti-Map 收集了小麦族各物种和六倍体小麦已有测序品种的基因组信息。统一进行了基因功能注释和转录因子结合位点预测,同时整合了各物种有代表性的表观修饰数据。该平台可以进行包括 SNP 注释与可视化展示、同源基因分析、小麦族共线性区间分析和新序列功能注释在内的多种分析,可以为小麦族基因克隆提供更加丰富的参考信息。
Web-based Annotation Module [在线分析平台](http://bioinfo.sibs.ac.cn/tritimap/)。Triti-Map 收集了小麦族各物种和六倍体小麦已有测序品种的基因组信息。统一进行了基因功能注释和转录因子结合位点预测,同时整合了各物种有代表性的表观修饰数据。该平台可以进行包括 SNP 注释与可视化展示、同源基因分析、小麦族共线性区间分析和新序列功能注释在内的多种分析,可以为小麦族基因克隆提供更加丰富的参考信息。

### Triti-Map 分析流程图

Expand Down Expand Up @@ -533,7 +533,7 @@ seqid hit_db hit_id hit_desc hit_url hsp_bit_score hsp_align_len hsp_identity hs

## 在线注释模块

Triti-Map 的上游分析结果可以配套[在线分析平台](https://github.com/fei0810/Triti-Map)进行后续分析和可视化展示。包括对 SNP 进行功能和表观修饰注释,对基因进行同源基因分析,对性状关联区间进行共线性分析,对特异性片段进行功能注释等。
Triti-Map 的上游分析结果可以配套[在线分析平台](http://bioinfo.sibs.ac.cn/tritimap/)进行后续分析和可视化展示。包括对 SNP 进行功能和表观修饰注释,对基因进行同源基因分析,对性状关联区间进行共线性分析,对特异性片段进行功能注释等。

**在线分析平台主要功能示意图**

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setuptools.setup(
name="tritimap",
version="0.9.0",
version="0.9.2",
author="Fei Zhao",
author_email="zhaofei920810@gmail.com",
url="https://github.com/fei0810/Triti-Map",
Expand Down
2 changes: 2 additions & 0 deletions tritimap/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ if config['module'] == "only_mapping":
input:
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_snpindex_input.txt"),
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_output.txt"),
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_indelinfo.txt"),
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_region.txt"),
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_raw_region.txt"),
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_snpinfo.txt"),
Expand All @@ -209,6 +210,7 @@ elif config['module'] == "all":
input:
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_snpindex_input.txt"),
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_output.txt"),
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_indelinfo.txt"),
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_region.txt"),
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_raw_region.txt"),
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_snpinfo.txt"),
Expand Down
2 changes: 1 addition & 1 deletion tritimap/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os

__version__ = "0.9.0"
__version__ = "0.9.2"

root_dir = os.path.dirname(os.path.abspath(__file__))
19 changes: 18 additions & 1 deletion tritimap/rules/vcf_pretreat.smk
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,35 @@ rule vcf2tab:
scriptdir = script_dir
output:
snpindex = join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_snpindex_input.txt"),
indel = temp(join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_indel.txt")),
qtlseqr = join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_qtlseqr_input.txt")
message: "\nTransform genotype filterd VCF file to tab format. Input file: {input}\n"
log:
join(dir_path+"/logs", "_".join(samples.bulk.drop_duplicates()) + "_vcf2tab.log")
shell:"""
set +e
bash {params.scriptdir}/vcf2tab.sh {input} {output.snpindex} {output.qtlseqr} {params.pool1} {params.pool2} {params.snpdepth} {params.genome} > {log} 2>&1
bash {params.scriptdir}/vcf2tab.sh {input} {output.snpindex} {output.qtlseqr} {params.pool1} {params.pool2} {params.snpdepth} {params.genome} {output.indel} > {log} 2>&1
exitcode=$?
if [ $exitcode -eq 1 ]
then
exit 1
else
exit 0
fi
"""

rule getRegionIndel:
input:
indel = join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates()) + "_indel.txt"),
region=join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_region.txt")
params:
dirpath = join(dir_path+"/06_regionout")
output:
join(dir_path+"/06_regionout", "_".join(samples.bulk.drop_duplicates())+ "_qtlseqr_filter_indelinfo.txt")
message: "\nGet indel information in candidate region. Input file: {input}\n"
log:
join(dir_path+"/logs", "_".join(samples.bulk.drop_duplicates()) + "_getregionindel.log")
shell:"""
cat {input.region} | grep -v 'start' | cut -f1,3,4 | sort -k1,1 -k2,2n > {params.dirpath}/temp.candidateregion.indel.bed
bedmap --skip-unmapped --delim '\t' --header --echo {input.indel} {params.dirpath}/temp.candidateregion.indel.bed | cat <(head -n1 {input.indel}) - > {output} && rm {params.dirpath}/temp.candidateregion.indel.bed
"""
10 changes: 4 additions & 6 deletions tritimap/scripts/getuniqscaffold.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ bioawk -v len=$length -c fastx '{if(length($seq)>=len){print ">"$name;print $seq

# map to ref
if [ $type == "dna" ]; then
bwa-mem2 mem -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam -
bwa-mem2 mem -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam -
bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam -
bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam -
elif [ $type == "rna" ]; then
minimap2 -ax splice --split-prefix minimap_${id1} -I $mem $genome ${dir}/temp.${id1}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id1}.scaffolds.500.bam -
minimap2 -ax splice --split-prefix minimap_${id2} -I $mem $genome ${dir}/temp.${id2}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id2}.scaffolds.500.bam -
Expand Down Expand Up @@ -90,7 +90,7 @@ samtools faidx -r ${dir}/temp.${id1}2${id2}_samtools.region.txt $genome >${dir}/

makeblastdb -in ${dir}/temp.${id1}2${id2}_samtools.region.fasta -dbtype nucl

# map to database
# map to database
blastn -db ${dir}/temp.${id1}2${id2}_samtools.region.fasta -query ${dir}/temp.${id1}_step2_candidate.fasta -out ${dir}/temp.${id1}_step3_candidate.outfmt6 -outfmt 6 -num_threads $thread
cut -f1 ${dir}/temp.${id1}_step3_candidate.outfmt6 | sort -u >${dir}/temp.${id1}_step3_candidate.id
seqkit grep -n -f ${dir}/temp.${id1}_step3_candidate.id ${dir}/temp.${id1}_step2_candidate.fasta >${dir}/temp.${id1}_step3_candidate.fasta
Expand All @@ -116,7 +116,6 @@ cat <(awk 'BEGIN{print "seqid\tchrom\tstart\tend\thit_length\thit_score"}') ${di
samtools view -f 4 ${dir}/temp.${id1}.scaffolds.500.bam | awk -v name=${id1} -F "\t" '{print ">"$1":"length($10)":"name"\n"$10}' >${dir}/temp.${id1}_unmap.fasta
samtools view -f 4 ${dir}/temp.${id2}.scaffolds.500.bam | awk -v name=${id2} -F "\t" '{print ">"$1":"length($10)":"name"\n"$10}' >${dir}/temp.${id2}_unmap.fasta


makeblastdb -in ${dir}/temp.${id1}_unmap.fasta -dbtype nucl
makeblastdb -in ${dir}/temp.${id2}_unmap.fasta -dbtype nucl

Expand All @@ -141,8 +140,7 @@ seqkit grep -n -v -f ${dir}/temp.${id2}_unmap_needfilter.id ${dir}/temp.${id2}_u
cp ${dir}/temp.${id1}_step2_unmap.fasta $unmap1
cp ${dir}/temp.${id2}_step2_unmap.fasta $unmap2

if [ -d $dir/temp_output ];then
if [ -d $dir/temp_output ]; then
rm -rf $dir/temp_output
fi
mkdir ${dir}/temp_output && mv ${dir}/temp.${id1}* ${dir}/temp.${id2}* ${dir}/temp_output

11 changes: 5 additions & 6 deletions tritimap/scripts/getuniqscaffold_by_fasta.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#
###############################################################################


input1=$(realpath $1)
id1=$(basename $1 | sed 's/_denovo_scaffolds.fasta//')
input2=$(realpath $2)
Expand All @@ -45,8 +44,8 @@ bioawk -v len=$length -c fastx '{if(length($seq)>=len){print ">"$name;print $seq

# map to ref
if [ $type == "dna" ]; then
bwa-mem2 mem -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam -
bwa-mem2 mem -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam -
bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id1}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id1}.scaffolds.500.bam -
bwa-mem2 mem -v 1 -t $thread $genome ${dir}/temp.${id2}.scaffolds.500.fasta | samtools view -S -b - | samtools sort -@ $thread -o ${dir}/temp.${id2}.scaffolds.500.bam -
elif [ $type == "rna" ]; then
minimap2 -ax splice --split-prefix minimap_${id1} -I $mem $genome ${dir}/temp.${id1}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id1}.scaffolds.500.bam -
minimap2 -ax splice --split-prefix minimap_${id2} -I $mem $genome ${dir}/temp.${id2}.scaffolds.500.fasta -t 30 | samtools view -S -b - | samtools sort -o ${dir}/temp.${id2}.scaffolds.500.bam -
Expand Down Expand Up @@ -93,7 +92,7 @@ ln -s $database ${dir}/temp.${id1}2${id2}_samtools.region.fasta

makeblastdb -in ${dir}/temp.${id1}2${id2}_samtools.region.fasta -dbtype nucl

# map to database
# map to database
blastn -db ${dir}/temp.${id1}2${id2}_samtools.region.fasta -query ${dir}/temp.${id1}_step2_candidate.fasta -out ${dir}/temp.${id1}_step3_candidate.outfmt6 -outfmt 6 -num_threads $thread
cut -f1 ${dir}/temp.${id1}_step3_candidate.outfmt6 | sort -u >${dir}/temp.${id1}_step3_candidate.id
seqkit grep -n -f ${dir}/temp.${id1}_step3_candidate.id ${dir}/temp.${id1}_step2_candidate.fasta >${dir}/temp.${id1}_step3_candidate.fasta
Expand Down Expand Up @@ -144,7 +143,7 @@ seqkit grep -n -v -f ${dir}/temp.${id2}_unmap_needfilter.id ${dir}/temp.${id2}_u
cp ${dir}/temp.${id1}_step2_unmap.fasta $unmap1
cp ${dir}/temp.${id2}_step2_unmap.fasta $unmap2

if [ -d $dir/temp_output ];then
if [ -d $dir/temp_output ]; then
rm -rf $dir/temp_output
fi
mkdir ${dir}/temp_output && mv ${dir}/temp.${id1}* ${dir}/temp.${id2}* ${dir}/temp_output
mkdir ${dir}/temp_output && mv ${dir}/temp.${id1}* ${dir}/temp.${id2}* ${dir}/temp_output
24 changes: 23 additions & 1 deletion tritimap/scripts/vcf2tab.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ pool1=$4
pool2=$5
genome=$7
depth=$6
output3=$(realpath $8)

gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
SelectVariants \
Expand All @@ -43,17 +44,38 @@ gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
-select 'vc.getGenotype("'$pool1'").getDP() >= '$depth' && vc.getGenotype("'$pool2'").getDP() >= '$depth'' \
-O $dir/temp.${id}.realuse.vcf

gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
SelectVariants \
-R $genome \
-V $input \
--exclude-filtered \
-select-type INDEL \
--max-nocall-number 0 \
-sn $pool1 \
-sn $pool2 \
-select 'DP >= '$depth'' \
-select 'vc.getGenotype("'$pool1'").getDP() >= '$depth' && vc.getGenotype("'$pool2'").getDP() >= '$depth'' \
-O $dir/temp.${id}.realuse.indel.vcf

gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
VariantsToTable \
-V $dir/temp.${id}.realuse.vcf \
-R $genome \
-F CHROM -F POS -F REF -F ALT -GF AD -GF DP \
-O $dir/temp.${id}.realuse.gatk.table

gatk --java-options "-Xmx10G -Djava.io.tmpdir=./" \
VariantsToTable \
-V $dir/temp.${id}.realuse.indel.vcf \
-R $genome \
-F CHROM -F POS -F REF -F ALT -GF AD -GF DP \
-O $dir/temp.${id}.realuse.indel.gatk.table

if [ -s $dir/temp.${id}.realuse.gatk.table ]; then
cat $dir/temp.${id}.realuse.gatk.table | awk '$4!~/,/' | tr ',' '\t' | awk -v bulk1=$pool1 -v bulk2=$pool2 'BEGIN{OFS="\t";print "#CHROM\tPOSm1\tPOS\tREF\tALT\t"bulk1"_ref\t"bulk1"_alt\t"bulk1"_depth\t"bulk1"_ratio\t"bulk2"_ref\t"bulk2"_alt\t"bulk2"_depth\t"bulk2"_ratio\tsnpindex"}NR>1{print $1,$2-1,$2,$3,$4,$5,$6,$7,$6/$7,$8,$9,$10,$9/$10,$6/$7-$9/$10}' >$output1
cat $output1 | awk -v bulk1=$pool1 -v bulk2=$pool2 'BEGIN{OFS="\t"; print "CHROM\tPOS\tAD_REF."bulk1"\tAD_ALT."bulk1"\tAD_REF."bulk2"\tAD_ALT."bulk2}NR>1{print $1,$3,$6,$7,$10,$11}' >$output2
if [ -d $dir/temp_output ];then
cat $dir/temp.${id}.realuse.indel.gatk.table | awk '$4!~/,/' | tr ',' '\t' | awk -v bulk1=$pool1 -v bulk2=$pool2 'BEGIN{OFS="\t";print "#CHROM\tPOSm1\tPOS\tREF\tALT\t"bulk1"_ref\t"bulk1"_alt\t"bulk1"_depth\t"bulk1"_ratio\t"bulk2"_ref\t"bulk2"_alt\t"bulk2"_depth\t"bulk2"_ratio\tsnpindex"}NR>1{print $1,$2-1,$2,$3,$4,$5,$6,$7,$6/$7,$8,$9,$10,$9/$10,$6/$7-$9/$10}' >$output3
if [ -d $dir/temp_output ]; then
rm -rf $dir/temp_output
fi
mkdir $dir/temp_output && mv $dir/temp.${id}* $dir/temp_output
Expand Down
2 changes: 2 additions & 0 deletions tritimap_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ dependencies:
- biopython
- fastp>=0.20.1
- bioawk
- blast >=2.5.0
- bedops >=2.4.39
- bwa-mem2=2.2.1
- samtools=1.12
- seqkit>=0.15.0
Expand Down

0 comments on commit 1a1181e

Please sign in to comment.