diff --git a/Examples/arcs-make b/Examples/arcs-make deleted file mode 100755 index 04596a7..0000000 --- a/Examples/arcs-make +++ /dev/null @@ -1,312 +0,0 @@ -#!/usr/bin/make -f -# Pipeline for the ARCS program -# Written by Jeffrey Tse -#Default Parameters - -# Input Names -draft=draft -reads=reads - -# Find the complete long read file name -fastq=$(shell test -f $(reads).fq.gz && echo "true") -fasta=$(shell test -f $(reads).fa.gz && echo "true") -ifeq ($(fastq), true) -long_reads=$(reads).fq.gz -endif -ifeq ($(fasta), true) -long_reads=$(reads).fa.gz -endif - -# tigmint Parameters -minsize=2000 -as=0.65 -nm=5 -dist=50000 -mapq=0 -trim=0 -span=20 -window=1000 - -# ARCS ARKS Common Parameters -barcode_counts=barcodeMultiplicityArcs -c=5 -m=50-10000 -z=500 -s=98 -r=0.05 -e=30000 -D=false -dist_upper=false -d=0 -gap=100 -B=20 - -# ARCS Specific Parameters -s=98 -cut=250 - -# ARKS Specific Parameters -j=0.55 -k=30 -t=8 - -# LINKS Parameters -l=5 -a=0.3 -bin=$(dir $(realpath $(firstword $(MAKEFILE_LIST)))) - -# Control minimap2 index split parameter (-I) -minimap2_index_size=50G - -SHELL=bash -e -o pipefail -ifeq ($(shell zsh -e -o pipefail -c 'true' 2>/dev/null; echo $$?), 0) -#Set pipefail to ensure that all commands of a pipe succeed. -SHELL=zsh -e -o pipefail -# Report run time and memory usage with zsh. -export REPORTTIME=1 -export TIMEFMT=time user=%U system=%S elapsed=%E cpu=%P memory=%M job=%J -endif - -# Use pigz or bgzip for parallel compression if available. -ifneq ($(shell command -v pigz),) -gzip=pigz -p$t -else -ifneq ($(shell command -v bgzip),) -gzip=bgzip -@$t -else -gzip=gzip -endif -endif - -# Record run time and memory usage in a file using GNU time. -ifdef time -ifneq ($(shell command -v gtime),) -gtime=command gtime -v -o $@.time -else -gtime=command time -v -o $@.time -endif -endif - - -.PHONY: all version help clean tigmint arcs arcs-tigmint arcs-with-tigmint arks arks-tigmint arks-with-tigmint -.DELETE_ON_ERROR: -.SECONDARY: - - -all: help -# Help -help: - @echo "Usage: ./arcs-make [COMMAND] [OPTION=VALUE]..." - @echo " Commands:" - @echo "" - @echo " arcs run arcs in default mode only, skipping tigmint" - @echo " arcs-tigmint run tigmint, and run arcs in default mode with the output of tigmint" - @echo " arcs-long run arcs in default mode only, using long instead of linked reads, skipping tigmint" - @echo " arks run arcs in kmer mode only, skipping tigmint" - @echo " arks-tigmint run tigmint, and run arcs in kmer mode with the output of tigmint" - @echo " arks-long run arcs in kmer mode only, using long instead of linked reads, skipping tigmint" - @echo " help display this help page" - @echo " version display the software version" - @echo " clean remove intermediate files" - @echo "" - @echo " General Options:" - @echo "" - @echo " draft draft name [draft]. File must have .fasta or .fa extension" - @echo " reads read name [reads]. File must have .fastq.gz or .fq.gz extension" - @echo " time logs time and memory usage to file for main steps (Set to 1 to enable logging)" - @echo "" - @echo " bwa Options:" - @echo "" - @echo " t number of threads used [8]" - @echo "" - @echo " Tigmint Options:" - @echo "" - @echo " minsize minimum molecule size [2000]" - @echo " as minimum AS/read length ratio [0.65]" - @echo " nm maximum number of mismatches [5]" - @echo " dist max dist between reads to be considered the same molecule [50000]" - @echo " mapq mapping quality threshold [0]" - @echo " trim bp of contigs to trim after cutting at error [0]" - @echo " span min number of spanning molecules to be considered assembled [20]" - @echo " window window size for checking spanning molecules [1000]" - @echo "" - @echo " Common Options:" - @echo "" - @echo " c minimum aligned read pairs per barcode mapping [5]" - @echo " m barcode multiplicity range [50-10000]" - @echo " z minimum contig length [500]" - @echo " r p-value for head/tail assigment and link orientation [0.05]" - @echo " e contig head/tail length for masking aligments [30000]" - @echo " D enable distance estimation [false]" - @echo " dist_upper use upper bound distance over median distance [false]" - @echo " B estimate distance using N closest Jaccard scores [20]" - @echo " d max node degree in scaffold graph [0]" - @echo " gap fixed gap size for dist.gv file [100]" - @echo " barcode_counts name of output barcode multiplicity TSV file [barcodeMultiplicityArcs]" - @echo " cut cut length for long reads (for arcs-long and arks-long only) [250]" - @echo "" - @echo " ARCS Specific Options:" - @echo " s minimum sequence identity [98]" - @echo "" - @echo " ARKS Specific Options:" - @echo " j minimum fraction of read kmers matching a contigId [0.55]" - @echo " k size of a k-mer [30]" - @echo " t number of threads [8]" - @echo "" - @echo " LINKS Options:" - @echo "" - @echo " l minimum number of links to compute scaffold [5]" - @echo " a maximum link ratio between two best contig pairs [0.3]" - @echo "" - @echo "Example: To run tigmint and arcs with myDraft.fa, myReads.fq.gz, and a custom multiplicty range, run:" - @echo " ./arcs-make arcs-tigmint draft=myDraft reads=myReads m=[User defined multiplicty range]" - @echo "To ensure that the pipeline runs correctly, make sure that the following tools are in your PATH: bwa, tigmint, samtools, arcs (>= v1.1.0), LINKS (>= v1.8.6)" - -clean: - rm -f *.amb *.ann *.bwt *.pac *.sa *.dist.gv *.fai *.bed *.molecule.tsv *.sortbx.bam - @echo "Clean Done" - -version: - @echo "arcs-make v1.2.2" - -#Preprocessing - -# Create a .fa file that is soft linked to .fasta -%.fa: %.fasta - ln -s $^ $@ - -# Create a .fq.gz file that is soft linked to .fastq.gz -%.fq.gz: %.fastq.gz - ln -s $^ $@ - - -#Run Tigmint -arcs-tigmint: tigmint arcs-with-tigmint -arks-tigmint: tigmint arks-with-tigmint - -# Main -tigmint: $(draft).tigmint.fa -# Run tigmint -$(draft).tigmint.fa: $(draft).fa $(reads).fq.gz - $(gtime) tigmint tigmint draft=$(draft) reads=$(reads) minsize=$(minsize) as=$(as) nm=$(nm) dist=$(dist) mapq=$(mapq) trim=$(trim) span=$(span) window=$(window) t=$t - -#Pre-processing long reads; cut into shorter segments (pseudo-linked reads) -$(reads).cut$(cut).fq.gz: $(long_reads) - $(gtime) sh -c '$(bin)/../src/long-to-linked-pe -l $(cut) -t $t -m2000 $(long_reads) | $(gzip) > $@' - - -#Run ARCS -arcs: $(draft)_c$c_m$m_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa -arcs-with-tigmint: $(draft).tigmint_c$c_m$m_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa -arcs-long: $(draft)_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa -arks: $(draft)_c$c_m$m_k$k_r$r_e$e_z$z_l$l_a$a.scaffolds.fa -arks-with-tigmint: $(draft).tigmint_c$c_m$m_k$k_r$r_e$e_z$z_l$l_a$a.scaffolds.fa -arks-long: $(draft)_c$c_m$m_cut$(cut)_k$k_r$r_e$e_z$z_l$l_a$a.scaffolds.fa - -# Convert Scaffold Names into Numerical Numbers -%.renamed.fa: %.fa - perl -ne 'chomp; if(/>/){$$ct+=1; print ">$$ct\n";}else{print "$$_\n";} ' < $^ > $@ - -# Make bwa index from Draft Assembly -%.renamed.fa.bwt: %.renamed.fa - $(gtime) bwa index $^ - -# Use bwa mem to Align Reads to Draft Assembly and Sort it -%.sorted.bam: %.renamed.fa $(reads).fq.gz %.renamed.fa.bwt - $(gtime) sh -c 'bwa mem -t$t -C -p $< $(reads).fq.gz | samtools view -Sb - | samtools sort -@$t -n - -o $@' - - -# Create an fof File Containing the bam File -%_bamfiles.fof: %.sorted.bam - echo $^ > $@ - -# Run ARCS Program -%_c$c_m$m_s$s_r$r_e$e_z$z_original.gv: %.renamed.fa %_bamfiles.fof -ifneq ($D, true) - $(gtime) arcs -f $< -a $(word 2,$^) -v -c $c -m $m -s $s -r $r -e $e -z $z -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) --barcode-counts $(barcode_counts).tsv -else ifneq ($(dist_upper), true) - $(gtime) arcs -D -B $B -v -f $< -a $(word 2,$^) -c $c -m $m -s $s -r $r -e $e -z $z -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) --barcode-counts $(barcode_counts).tsv -else - $(gtime) arcs -D -B $B --dist_upper -v -f $< -a $(word 2,$^) -c $c -m $m -s $s -r $r -e $e -z $z -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) --barcode-counts $(barcode_counts).tsv -endif - -# long -%_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_original.gv: %.renamed.fa $(reads).cut$(cut).fq.gz -ifneq ($D, true) - $(gtime) minimap2 -ax map-ont -y -t$t --secondary=no -I $(minimap2_index_size) $< $(reads).cut$(cut).fq.gz | abyss-fixmate-ssq --all --qname | \ - samtools view -Sb - | samtools sort -@$t -O SAM -n - -o - | \ - arcs -f $< -v -c $c -m $m -s $s -r $r -e $e -z $z -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) --barcode-counts $(barcode_counts).tsv /dev/stdin -else ifneq ($(dist_upper), true) - $(gtime) minimap2 -ax map-ont -y -t$t --secondary=no -I $(minimap2_index_size) $< $(reads).cut$(cut).fq.gz | abyss-fixmate-ssq --all --qname | \ - samtools view -Sb - | samtools sort -@$t -O SAM -n - -o - | \ - arcs -D -B $B -v -f $< -c $c -m $m -s $s -r $r -e $e -z $z -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) --barcode-counts $(barcode_counts).tsv /dev/stdin -else - $(gtime) minimap2 -ax map-ont -y -t$t --secondary=no -I $(minimap2_index_size) $< $(reads).cut$(cut).fq.gz | abyss-fixmate-ssq --all --qname | \ - samtools view -Sb - | samtools sort -@$t -O SAM -n - -o - | \ - arcs -D -B $B --dist_upper -v -f $< -c $c -m $m -s $s -r $r -e $e -z $z -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) --barcode-counts $(barcode_counts).tsv /dev/stdin -endif - -# kmer -%_c$c_m$m_k$k_r$r_e$e_z$z_original.gv: %.renamed.fa $(reads).fq.gz -ifneq ($D, true) - $(gtime) arcs --arks -v -f $< -c $c -m $m -r $r -e $e -z $z -j $j -k $k -t $t -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) $(word 2,$^) --barcode-counts $(barcode_counts).tsv - -else ifneq ($(dist_upper), true) - $(gtime) arcs --arks -v -D -B $B -f $< -c $c -m $m -r $r -e $e -z $z -j $j -k $k -t $t -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) $(word 2,$^) --barcode-counts $(barcode_counts).tsv -else - $(gtime) arcs --arks -v -D -B $B --dist_upper -f $< -c $c -m $m -r $r -e $e -z $z -j $j -k $k -t $t -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) $(word 2,$^) --barcode-counts $(barcode_counts).tsv -endif - -# long kmer -$(reads).barcode-multiplicity.tsv: $(long_reads) - $(gtime) $(bin)/../src/long-to-linked-pe -l $(cut) -t $t -m2000 --bx-only -b $@ $< - -%_c$c_m$m_cut$(cut)_k$k_r$r_e$e_z$z_original.gv: %.renamed.fa $(long_reads) $(reads).barcode-multiplicity.tsv -ifneq ($D, true) - $(gtime) $(bin)/../src/long-to-linked-pe -l $(cut) -t $t -m2000 $(long_reads) |\ - arcs --arks -v -f $< -c $c -m $m -r $r -e $e -z $z -j $j -k $k -t $t -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) -u $(reads).barcode-multiplicity.tsv /dev/stdin - -else ifneq ($(dist_upper), true) - $(gtime) $(bin)/../src/long-to-linked-pe -l $(cut) -t $t -m2000 $(long_reads) |\ - arcs --arks -v -D -B $B -f $< -c $c -m $m -r $r -e $e -z $z -j $j -k $k -t $t -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) -u $(reads).barcode-multiplicity.tsv /dev/stdin -else - $(gtime) $(bin)/../src/long-to-linked-pe -l $(cut) -t $t -m2000 $(long_reads) |\ - arcs --arks -v -D -B $B --dist_upper -f $< -c $c -m $m -r $r -e $e -z $z -j $j -k $k -t $t -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) -u $(reads).barcode-multiplicity.tsv /dev/stdin -endif - -# Generate TSV from ARCS -# default -%_c$c_m$m_s$s_r$r_e$e_z$z.tigpair_checkpoint.tsv: %_c$c_m$m_s$s_r$r_e$e_z$z_original.gv %.renamed.fa - python $(bin)../Examples/makeTSVfile.py $< $@ $(word 2,$^) -# long -%_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z.tigpair_checkpoint.tsv: %_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_original.gv %.renamed.fa - python $(bin)../Examples/makeTSVfile.py $< $@ $(word 2,$^) -# kmer -%_c$c_m$m_k$k_r$r_e$e_z$z.tigpair_checkpoint.tsv: %_c$c_m$m_k$k_r$r_e$e_z$z_original.gv %.renamed.fa - python $(bin)../Examples/makeTSVfile.py $< $@ $(word 2,$^) -# long kmer -%_c$c_m$m_cut$(cut)_k$k_r$r_e$e_z$z.tigpair_checkpoint.tsv: %_c$c_m$m_cut$(cut)_k$k_r$r_e$e_z$z_original.gv %.renamed.fa - python $(bin)../Examples/makeTSVfile.py $< $@ $(word 2,$^) - -# Adds a and l paramters to the filename -%_z$z_l$l_a$a.tigpair_checkpoint.tsv: %_z$z.tigpair_checkpoint.tsv - ln -sf $^ $@ - -# Make an Empty fof File -empty.fof: - touch $@ - -# Run LINKS -# default -%_c$c_m$m_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.renamed.fa empty.fof %_c$c_m$m_s$s_r$r_e$e_z$z_l$l_a$a.tigpair_checkpoint.tsv - $(gtime) LINKS -f $< -s empty.fof -b $(patsubst %.scaffolds.fa,%,$@) -l $l -a $a -z $z -# long -%_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.renamed.fa empty.fof %_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_l$l_a$a.tigpair_checkpoint.tsv - $(gtime) LINKS -f $< -s empty.fof -b $(patsubst %.scaffolds.fa,%,$@) -l $l -a $a -z $z -# kmer -%_c$c_m$m_k$k_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.renamed.fa empty.fof %_c$c_m$m_k$k_r$r_e$e_z$z_l$l_a$a.tigpair_checkpoint.tsv - $(gtime) LINKS -f $< -s empty.fof -b $(patsubst %.scaffolds.fa,%,$@) -l $l -a $a -z $z -# long kmer -%_c$c_m$m_cut$(cut)_k$k_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.renamed.fa empty.fof %_c$c_m$m_cut$(cut)_k$k_r$r_e$e_z$z_l$l_a$a.tigpair_checkpoint.tsv - $(gtime) LINKS -f $< -s empty.fof -b $(patsubst %.scaffolds.fa,%,$@) -l $l -a $a -z $z - diff --git a/Examples/arcs-make b/Examples/arcs-make new file mode 120000 index 0000000..343ba6c --- /dev/null +++ b/Examples/arcs-make @@ -0,0 +1 @@ +../bin/arcs-make \ No newline at end of file diff --git a/Examples/makeTSVfile.py b/Examples/makeTSVfile.py deleted file mode 100644 index 3d2c347..0000000 --- a/Examples/makeTSVfile.py +++ /dev/null @@ -1,128 +0,0 @@ -#!/usr/bin/env python -""" -Create a tigpair_checkpoint file from ARCS output that LINKS can utilize -""" -import re -import argparse - -index2scaff_name = {} -links_numbering = {} - -def readGraphFile(infile): - """Read ARCS graph file output (.gv)""" - with open(infile, 'r') as f: - for line in f: - test = re.match(r"(\d+)\s+\[id=\"?([^\]\"]+)\"?\]", line.rstrip()) - if test: - index = test.group(1) - scaff_name = test.group(2) - if scaff_name not in links_numbering: - index2scaff_name[index] = scaff_name - -def makeLinksNumbering(scaffolds_fasta): - """""" - counter = 0 - with open(scaffolds_fasta, 'r') as f: - for line in f: - if line[0] == ">": - seq_id = line.rstrip().split()[0][1:] - counter += 1 - links_numbering[seq_id] = str(counter) - - -def writeTSVFile(infile, outfile): - """Create tigpair_checkpoint file""" - with open(infile, 'r') as fin: - with open(outfile, 'w') as fout: - for line in fin: - test = re.search(r"(\d+)--(\d+)\s+\[label=(\d+), weight=(\d+)", line.rstrip()) - if test: - scaffa = index2scaff_name[test.group(1)] - scaffb = index2scaff_name[test.group(2)] - label = int(test.group(3)) - links = int(test.group(4)) - - if scaffa > scaffb: - scaffa, scaffb = scaffb, scaffa - - out_scaffa = "" - out_scaffb = "" - rout_scaffa = "" - rout_scaffb = "" - out_scaffa = out_scaffb = rout_scaffa = rout_scaffb = "" - - # HH : rA->B or rB->A - if label == 0: - out_scaffa = "r" + links_numbering[scaffa] - out_scaffb = "f" + links_numbering[scaffb] - - rout_scaffb = "r" + links_numbering[scaffb] - rout_scaffa = "f" + links_numbering[scaffa] - # HT : rA->rB or B->A - elif label == 1: - out_scaffa = "r" + links_numbering[scaffa] - out_scaffb = "r" + links_numbering[scaffb] - - rout_scaffb = "f" + links_numbering[scaffb] - rout_scaffa = "f" + links_numbering[scaffa] - # TH : A->B or rB->rA - elif label == 2: - out_scaffa = "f" + links_numbering[scaffa] - out_scaffb = "f" + links_numbering[scaffb] - - rout_scaffb = "r" + links_numbering[scaffb] - rout_scaffa = "r" + links_numbering[scaffa] - # TT : A->rB or B->rA - elif label == 3: - out_scaffa = "f" + links_numbering[scaffa] - out_scaffb = "r" + links_numbering[scaffb] - - rout_scaffb = "f" + links_numbering[scaffb] - rout_scaffa = "r" + links_numbering[scaffa] - - match = re.search(r"d=(\d+)", line.rstrip()) - if match: - dist = int(match.group(1)) - est_dist = True - else: - dist = 100 - est_dist = False - - if dist < 0: - dist_category = -1 - elif dist == 100 and est_dist is False: - dist_category = 10 - elif dist < 500: - dist_category = 500 - elif dist < 1000: - dist_category = 1000 - elif dist < 5000: - dist_category = 5000 - else: - dist_category = 10000 - - - gap = links * dist - - string = str(dist_category) + "\t" + out_scaffa + "\t" + \ - out_scaffb + "\t" + str(links) + "\t" + str(gap) + "\n" - fout.write(string) - - stringr = str(dist_category) + "\t" + rout_scaffb + "\t" + \ - rout_scaffa + "\t" + str(links) + "\t" + str(gap) + "\n" - fout.write(stringr) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Create a XXX.tigpair_checkpoint.tsv file from ARCS graph output ' - 'that LINKS can utilize') - parser.add_argument('graph_file', help='ARCS graph file output (.gv)', type=str) - parser.add_argument('out_file', help='Output file name. Must be named XXX.tigpair_checkpoint.tsv, where XXX ' - 'is same as base name (-b) given to LINKS.', type=str) - parser.add_argument('fasta_file', help='FASTA file of sequences to scaffold', type=str) - args = parser.parse_args() - - readGraphFile(args.graph_file) - makeLinksNumbering(args.fasta_file) - writeTSVFile(args.graph_file, args.out_file) - - diff --git a/Examples/makeTSVfile.py b/Examples/makeTSVfile.py new file mode 120000 index 0000000..ebe5d0a --- /dev/null +++ b/Examples/makeTSVfile.py @@ -0,0 +1 @@ +../bin/makeTSVfile.py \ No newline at end of file diff --git a/README.md b/README.md index 9b166e9..ae27b75 100644 --- a/README.md +++ b/README.md @@ -49,13 +49,13 @@ The ARCS+LINKS pipeline requires two input files: * Reads file in fastq format `*.fq.gz` (or fasta format `*.fa.gz` if using long reads) * For linked reads, ARCS expects an interleaved linked reads file (Barcode sequence expected in the BX tag of the read header or in the form "@readname_barcode" ; Run [Long Ranger basic](https://support.10xgenomics.com/genome-exome/software/pipelines/latest/what-is-long-ranger) on raw chromium reads to produce this interleaved file) -The Makefile located here: Examples/arcs-make will run the full ARCS pipeline. It will also optionally run the misassembly corrector [Tigmint](https://github.com/bcgsc/tigmint) prior to scaffolding with ARCS. +The Makefile located here: bin/arcs-make will run the full ARCS pipeline. It will also optionally run the misassembly corrector [Tigmint](https://github.com/bcgsc/tigmint) prior to scaffolding with ARCS. There are three steps to the pipeline: 1. Run ARCS to generate a Graphviz Dot file (.gv). Nodes in the graph are the sequences to scaffold, and edges show that there is evidence to suggest nodes are linked based on the data obtained from the GemCode/Chromium reads. -2. Run the python script Examples/makeTSVfile.py to generate a file named XXX.tigpair_checkpoint file from the ARCS graph file. The XXX.tigpair_checkpoint file will be provided to LINKS in step 3. +2. Run the python script bin/makeTSVfile.py to generate a file named XXX.tigpair_checkpoint file from the ARCS graph file. The XXX.tigpair_checkpoint file will be provided to LINKS in step 3. 3. Run LINKS with the XXX.tigpair_checkpoint file as input. To do this, the base name (-b) must be set to the same name as XXX. @@ -67,12 +67,12 @@ An example bash script on how to run the ARCS+LINKS pipeline can be found at: Ex The default mode uses alignments of linked reads to contigs to scaffold the input contigs. -To run the pipeline in default mode, run `Examples/arcs-make arcs`. For example, to scaffold the assembly `my_scaffolds.fa` with the interleaved, longranger processed reads `my_reads.fq.gz`, specifying a minimum contig length of 1000bp: +To run the pipeline in default mode, run `bin/arcs-make arcs`. For example, to scaffold the assembly `my_scaffolds.fa` with the interleaved, longranger processed reads `my_reads.fq.gz`, specifying a minimum contig length of 1000bp: ``` arcs-make arcs draft=my_scaffolds reads=my_reads z=1000 ``` -For more info check `Examples/arcs-make help`. +For more info check `bin/arcs-make help`. To run the `arcs` executable in default mode, run `arcs `. For descriptions of all arguments, run `arcs --help`. @@ -80,12 +80,12 @@ To run the `arcs` executable in default mode, run `arcs `. For descr The arcs-long mode first segments and assigns barcodes to the long reads, yielding pseudo-linked reads. Alignments of the pseudo-linked reads are then used to scaffold the input contigs. -To run the pipeline in arcs-long mode, run `Examples/arcs-make arks-long`. For example, to scaffold the assembly `my_scaffolds.fa` with long reads `my_reads.fa.gz` or `my_reads.fq.gz`, specifying a minimum contig length of 1000bp: +To run the pipeline in arcs-long mode, run `bin/arcs-make arks-long`. For example, to scaffold the assembly `my_scaffolds.fa` with long reads `my_reads.fa.gz` or `my_reads.fq.gz`, specifying a minimum contig length of 1000bp: ``` arcs-make arcs-long draft=my_scaffolds reads=my_reads z=1000 ``` -For more info check `Examples/arcs-make help`. +For more info check `bin/arcs-make help`. **Parameters**: To account for the higher error rates in long reads vs linked reads, we suggest starting with the following values: * `m=8-10000` @@ -98,11 +98,11 @@ Note that lowering `c`, `l` and increasing `a` may increase contiguity, but will ### Running ARCS in '--arks' mode -To run the pipeline in ARKS mode, run `Examples/arcs-make arcs`. For example, to scaffold the assembly `my_scaffolds.fa` with the interleaved, longranger processed reads `my_reads.fq.gz`, specifying a kmer size of 60: +To run the pipeline in ARKS mode, run `bin/arcs-make arcs`. For example, to scaffold the assembly `my_scaffolds.fa` with the interleaved, longranger processed reads `my_reads.fq.gz`, specifying a kmer size of 60: ``` arcs-make arks draft=my_scaffolds reads=my_reads k=60 ``` -For more info check `Examples/arcs-make help`. +For more info check `bin/arcs-make help`. To run the `arcs` executable in ARKS mode, run `arcs --arks`. For descriptions of all arguments, run `arcs --help`. @@ -110,7 +110,7 @@ To run the `arcs` executable in ARKS mode, run `arcs --arks`. For descriptions o The arks-long mode first segments and assigns barcodes to the long reads, yielding pseudo-linked reads. Scaffolding is performed based on exact k-mer mapping of pseudo-linked reads to the input contigs. -To run the pipeline in arks-long mode, run `Examples/arcs-make arks-long`. For example, to scaffold the assembly `my_scaffolds.fa` with long reads `my_reads.fa.gz` or `my_reads.fq.gz`, specifying a kmer size of 20 and `j` of 0.05: +To run the pipeline in arks-long mode, run `bin/arcs-make arks-long`. For example, to scaffold the assembly `my_scaffolds.fa` with long reads `my_reads.fa.gz` or `my_reads.fq.gz`, specifying a kmer size of 20 and `j` of 0.05: ``` arcs-make arks-long draft=my_scaffolds reads=my_reads k=20 j=0.05 ``` diff --git a/bin/arcs-make b/bin/arcs-make new file mode 100755 index 0000000..7ef1d81 --- /dev/null +++ b/bin/arcs-make @@ -0,0 +1,317 @@ +#!/usr/bin/make -f +# Pipeline for the ARCS program +# Written by Jeffrey Tse +#Default Parameters + +# Input Names +draft=draft +reads=reads + +# Find the complete long read file name +fastq=$(shell test -f $(reads).fq.gz && echo "true") +fasta=$(shell test -f $(reads).fa.gz && echo "true") +ifeq ($(fastq), true) +long_reads=$(reads).fq.gz +endif +ifeq ($(fasta), true) +long_reads=$(reads).fa.gz +endif + +# tigmint Parameters +minsize=2000 +as=0.65 +nm=5 +dist=50000 +mapq=0 +trim=0 +span=20 +window=1000 + +# ARCS ARKS Common Parameters +barcode_counts=barcodeMultiplicityArcs +c=5 +m=50-10000 +z=500 +s=98 +r=0.05 +e=30000 +D=false +dist_upper=false +d=0 +gap=100 +B=20 + +# ARCS Specific Parameters +s=98 +cut=250 + +# ARKS Specific Parameters +j=0.55 +k=30 +t=8 + +# LINKS Parameters +l=5 +a=0.3 +bin=$(dir $(realpath $(firstword $(MAKEFILE_LIST)))) + +# Control minimap2 index split parameter (-I) +minimap2_index_size=50G + +SHELL=bash -e -o pipefail +ifeq ($(shell zsh -e -o pipefail -c 'true' 2>/dev/null; echo $$?), 0) +#Set pipefail to ensure that all commands of a pipe succeed. +SHELL=zsh -e -o pipefail +# Report run time and memory usage with zsh. +export REPORTTIME=1 +export TIMEFMT=time user=%U system=%S elapsed=%E cpu=%P memory=%M job=%J +endif + +# Use pigz or bgzip for parallel compression if available. +ifneq ($(shell command -v pigz),) +gzip=pigz -p$t +else +ifneq ($(shell command -v bgzip),) +gzip=bgzip -@$t +else +gzip=gzip +endif +endif + +# Record run time and memory usage in a file using GNU time. +ifdef time +ifneq ($(shell command -v gtime),) +gtime=command gtime -v -o $@.time +else +gtime=command time -v -o $@.time +endif +endif + +# Error if user sets LINKS 'a' to 1 +ifeq ($a, 1) +$(error Error: Please set LINKS 'a' parameter to less than 1) +endif + + +.PHONY: all version help clean tigmint arcs arcs-tigmint arcs-with-tigmint arks arks-tigmint arks-with-tigmint +.DELETE_ON_ERROR: +.SECONDARY: + + +all: help +# Help +help: + @echo "Usage: ./arcs-make [COMMAND] [OPTION=VALUE]..." + @echo " Commands:" + @echo "" + @echo " arcs run arcs in default mode only, skipping tigmint" + @echo " arcs-tigmint run tigmint, and run arcs in default mode with the output of tigmint" + @echo " arcs-long run arcs in default mode only, using long instead of linked reads, skipping tigmint" + @echo " arks run arcs in kmer mode only, skipping tigmint" + @echo " arks-tigmint run tigmint, and run arcs in kmer mode with the output of tigmint" + @echo " arks-long run arcs in kmer mode only, using long instead of linked reads, skipping tigmint" + @echo " help display this help page" + @echo " version display the software version" + @echo " clean remove intermediate files" + @echo "" + @echo " General Options:" + @echo "" + @echo " draft draft name [draft]. File must have .fasta or .fa extension" + @echo " reads read name [reads]. File must have .fastq.gz or .fq.gz extension" + @echo " time logs time and memory usage to file for main steps (Set to 1 to enable logging)" + @echo "" + @echo " bwa Options:" + @echo "" + @echo " t number of threads used [8]" + @echo "" + @echo " Tigmint Options:" + @echo "" + @echo " minsize minimum molecule size [2000]" + @echo " as minimum AS/read length ratio [0.65]" + @echo " nm maximum number of mismatches [5]" + @echo " dist max dist between reads to be considered the same molecule [50000]" + @echo " mapq mapping quality threshold [0]" + @echo " trim bp of contigs to trim after cutting at error [0]" + @echo " span min number of spanning molecules to be considered assembled [20]" + @echo " window window size for checking spanning molecules [1000]" + @echo "" + @echo " Common Options:" + @echo "" + @echo " c minimum aligned read pairs per barcode mapping [5]" + @echo " m barcode multiplicity range [50-10000]" + @echo " z minimum contig length [500]" + @echo " r p-value for head/tail assigment and link orientation [0.05]" + @echo " e contig head/tail length for masking aligments [30000]" + @echo " D enable distance estimation [false]" + @echo " dist_upper use upper bound distance over median distance [false]" + @echo " B estimate distance using N closest Jaccard scores [20]" + @echo " d max node degree in scaffold graph [0]" + @echo " gap fixed gap size for dist.gv file [100]" + @echo " barcode_counts name of output barcode multiplicity TSV file [barcodeMultiplicityArcs]" + @echo " cut cut length for long reads (for arcs-long and arks-long only) [250]" + @echo "" + @echo " ARCS Specific Options:" + @echo " s minimum sequence identity [98]" + @echo "" + @echo " ARKS Specific Options:" + @echo " j minimum fraction of read kmers matching a contigId [0.55]" + @echo " k size of a k-mer [30]" + @echo " t number of threads [8]" + @echo "" + @echo " LINKS Options:" + @echo "" + @echo " l minimum number of links to compute scaffold [5]" + @echo " a maximum link ratio between two best contig pairs [0.3]" + @echo "" + @echo "Example: To run tigmint and arcs with myDraft.fa, myReads.fq.gz, and a custom multiplicty range, run:" + @echo " ./arcs-make arcs-tigmint draft=myDraft reads=myReads m=[User defined multiplicty range]" + @echo "To ensure that the pipeline runs correctly, make sure that the following tools are in your PATH: bwa, tigmint, samtools, arcs (>= v1.1.0), LINKS (>= v1.8.6)" + +clean: + rm -f *.amb *.ann *.bwt *.pac *.sa *.dist.gv *.fai *.bed *.molecule.tsv *.sortbx.bam + @echo "Clean Done" + +version: + @echo "arcs-make v1.2.2" + +#Preprocessing + +# Create a .fa file that is soft linked to .fasta +%.fa: %.fasta + ln -s $^ $@ + +# Create a .fq.gz file that is soft linked to .fastq.gz +%.fq.gz: %.fastq.gz + ln -s $^ $@ + + +#Run Tigmint +arcs-tigmint: tigmint arcs-with-tigmint +arks-tigmint: tigmint arks-with-tigmint + +# Main +tigmint: $(draft).tigmint.fa +# Run tigmint +$(draft).tigmint.fa: $(draft).fa $(reads).fq.gz + $(gtime) tigmint tigmint draft=$(draft) reads=$(reads) minsize=$(minsize) as=$(as) nm=$(nm) dist=$(dist) mapq=$(mapq) trim=$(trim) span=$(span) window=$(window) t=$t + +#Pre-processing long reads; cut into shorter segments (pseudo-linked reads) +$(reads).cut$(cut).fq.gz: $(long_reads) + $(gtime) sh -c '$(bin)/../src/long-to-linked-pe -l $(cut) -t $t -m2000 $(long_reads) | $(gzip) > $@' + + +#Run ARCS +arcs: $(draft)_c$c_m$m_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa +arcs-with-tigmint: $(draft).tigmint_c$c_m$m_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa +arcs-long: $(draft)_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa +arks: $(draft)_c$c_m$m_k$k_r$r_e$e_z$z_l$l_a$a.scaffolds.fa +arks-with-tigmint: $(draft).tigmint_c$c_m$m_k$k_r$r_e$e_z$z_l$l_a$a.scaffolds.fa +arks-long: $(draft)_c$c_m$m_cut$(cut)_k$k_r$r_e$e_z$z_l$l_a$a.scaffolds.fa + +# Convert Scaffold Names into Numerical Numbers +%.renamed.fa: %.fa + perl -ne 'chomp; if(/>/){$$ct+=1; print ">$$ct\n";}else{print "$$_\n";} ' < $^ > $@ + +# Make bwa index from Draft Assembly +%.renamed.fa.bwt: %.renamed.fa + $(gtime) bwa index $^ + +# Use bwa mem to Align Reads to Draft Assembly and Sort it +%.sorted.bam: %.renamed.fa $(reads).fq.gz %.renamed.fa.bwt + $(gtime) sh -c 'bwa mem -t$t -C -p $< $(reads).fq.gz | samtools view -Sb - | samtools sort -@$t -n - -o $@' + + +# Create an fof File Containing the bam File +%_bamfiles.fof: %.sorted.bam + echo $^ > $@ + +# Run ARCS Program +%_c$c_m$m_s$s_r$r_e$e_z$z_original.gv: %.renamed.fa %_bamfiles.fof +ifneq ($D, true) + $(gtime) arcs -f $< -a $(word 2,$^) -v -c $c -m $m -s $s -r $r -e $e -z $z -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) --barcode-counts $(barcode_counts).tsv +else ifneq ($(dist_upper), true) + $(gtime) arcs -D -B $B -v -f $< -a $(word 2,$^) -c $c -m $m -s $s -r $r -e $e -z $z -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) --barcode-counts $(barcode_counts).tsv +else + $(gtime) arcs -D -B $B --dist_upper -v -f $< -a $(word 2,$^) -c $c -m $m -s $s -r $r -e $e -z $z -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) --barcode-counts $(barcode_counts).tsv +endif + +# long +%_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_original.gv: %.renamed.fa $(reads).cut$(cut).fq.gz +ifneq ($D, true) + $(gtime) minimap2 -ax map-ont -y -t$t --secondary=no -I $(minimap2_index_size) $< $(reads).cut$(cut).fq.gz | abyss-fixmate-ssq --all --qname | \ + samtools view -Sb - | samtools sort -@$t -O SAM -n - -o - | \ + arcs -f $< -v -c $c -m $m -s $s -r $r -e $e -z $z -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) --barcode-counts $(barcode_counts).tsv /dev/stdin +else ifneq ($(dist_upper), true) + $(gtime) minimap2 -ax map-ont -y -t$t --secondary=no -I $(minimap2_index_size) $< $(reads).cut$(cut).fq.gz | abyss-fixmate-ssq --all --qname | \ + samtools view -Sb - | samtools sort -@$t -O SAM -n - -o - | \ + arcs -D -B $B -v -f $< -c $c -m $m -s $s -r $r -e $e -z $z -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) --barcode-counts $(barcode_counts).tsv /dev/stdin +else + $(gtime) minimap2 -ax map-ont -y -t$t --secondary=no -I $(minimap2_index_size) $< $(reads).cut$(cut).fq.gz | abyss-fixmate-ssq --all --qname | \ + samtools view -Sb - | samtools sort -@$t -O SAM -n - -o - | \ + arcs -D -B $B --dist_upper -v -f $< -c $c -m $m -s $s -r $r -e $e -z $z -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) --barcode-counts $(barcode_counts).tsv /dev/stdin +endif + +# kmer +%_c$c_m$m_k$k_r$r_e$e_z$z_original.gv: %.renamed.fa $(reads).fq.gz +ifneq ($D, true) + $(gtime) arcs --arks -v -f $< -c $c -m $m -r $r -e $e -z $z -j $j -k $k -t $t -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) $(word 2,$^) --barcode-counts $(barcode_counts).tsv + +else ifneq ($(dist_upper), true) + $(gtime) arcs --arks -v -D -B $B -f $< -c $c -m $m -r $r -e $e -z $z -j $j -k $k -t $t -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) $(word 2,$^) --barcode-counts $(barcode_counts).tsv +else + $(gtime) arcs --arks -v -D -B $B --dist_upper -f $< -c $c -m $m -r $r -e $e -z $z -j $j -k $k -t $t -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) $(word 2,$^) --barcode-counts $(barcode_counts).tsv +endif + +# long kmer +$(reads).barcode-multiplicity.tsv: $(long_reads) + $(gtime) $(bin)/../src/long-to-linked-pe -l $(cut) -t $t -m2000 --bx-only -b $@ $< + +%_c$c_m$m_cut$(cut)_k$k_r$r_e$e_z$z_original.gv: %.renamed.fa $(long_reads) $(reads).barcode-multiplicity.tsv +ifneq ($D, true) + $(gtime) $(bin)/../src/long-to-linked-pe -l $(cut) -t $t -m2000 $(long_reads) |\ + arcs --arks -v -f $< -c $c -m $m -r $r -e $e -z $z -j $j -k $k -t $t -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) -u $(reads).barcode-multiplicity.tsv /dev/stdin + +else ifneq ($(dist_upper), true) + $(gtime) $(bin)/../src/long-to-linked-pe -l $(cut) -t $t -m2000 $(long_reads) |\ + arcs --arks -v -D -B $B -f $< -c $c -m $m -r $r -e $e -z $z -j $j -k $k -t $t -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) -u $(reads).barcode-multiplicity.tsv /dev/stdin +else + $(gtime) $(bin)/../src/long-to-linked-pe -l $(cut) -t $t -m2000 $(long_reads) |\ + arcs --arks -v -D -B $B --dist_upper -f $< -c $c -m $m -r $r -e $e -z $z -j $j -k $k -t $t -d $d --gap $(gap) -b $(patsubst %_original.gv,%,$@) -u $(reads).barcode-multiplicity.tsv /dev/stdin +endif + +# Generate TSV from ARCS +# default +%_c$c_m$m_s$s_r$r_e$e_z$z.tigpair_checkpoint.tsv: %_c$c_m$m_s$s_r$r_e$e_z$z_original.gv %.renamed.fa + python $(bin)../Examples/makeTSVfile.py $< $@ $(word 2,$^) +# long +%_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z.tigpair_checkpoint.tsv: %_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_original.gv %.renamed.fa + python $(bin)../Examples/makeTSVfile.py $< $@ $(word 2,$^) +# kmer +%_c$c_m$m_k$k_r$r_e$e_z$z.tigpair_checkpoint.tsv: %_c$c_m$m_k$k_r$r_e$e_z$z_original.gv %.renamed.fa + python $(bin)../Examples/makeTSVfile.py $< $@ $(word 2,$^) +# long kmer +%_c$c_m$m_cut$(cut)_k$k_r$r_e$e_z$z.tigpair_checkpoint.tsv: %_c$c_m$m_cut$(cut)_k$k_r$r_e$e_z$z_original.gv %.renamed.fa + python $(bin)../Examples/makeTSVfile.py $< $@ $(word 2,$^) + +# Adds a and l paramters to the filename +%_z$z_l$l_a$a.tigpair_checkpoint.tsv: %_z$z.tigpair_checkpoint.tsv + ln -sf $^ $@ + +# Make an Empty fof File +empty.fof: + touch $@ + +# Run LINKS +# default +%_c$c_m$m_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.renamed.fa empty.fof %_c$c_m$m_s$s_r$r_e$e_z$z_l$l_a$a.tigpair_checkpoint.tsv + $(gtime) LINKS -f $< -s empty.fof -b $(patsubst %.scaffolds.fa,%,$@) -l $l -a $a -z $z +# long +%_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.renamed.fa empty.fof %_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_l$l_a$a.tigpair_checkpoint.tsv + $(gtime) LINKS -f $< -s empty.fof -b $(patsubst %.scaffolds.fa,%,$@) -l $l -a $a -z $z +# kmer +%_c$c_m$m_k$k_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.renamed.fa empty.fof %_c$c_m$m_k$k_r$r_e$e_z$z_l$l_a$a.tigpair_checkpoint.tsv + $(gtime) LINKS -f $< -s empty.fof -b $(patsubst %.scaffolds.fa,%,$@) -l $l -a $a -z $z +# long kmer +%_c$c_m$m_cut$(cut)_k$k_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.renamed.fa empty.fof %_c$c_m$m_cut$(cut)_k$k_r$r_e$e_z$z_l$l_a$a.tigpair_checkpoint.tsv + $(gtime) LINKS -f $< -s empty.fof -b $(patsubst %.scaffolds.fa,%,$@) -l $l -a $a -z $z + diff --git a/bin/makeTSVfile.py b/bin/makeTSVfile.py new file mode 100644 index 0000000..3d2c347 --- /dev/null +++ b/bin/makeTSVfile.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python +""" +Create a tigpair_checkpoint file from ARCS output that LINKS can utilize +""" +import re +import argparse + +index2scaff_name = {} +links_numbering = {} + +def readGraphFile(infile): + """Read ARCS graph file output (.gv)""" + with open(infile, 'r') as f: + for line in f: + test = re.match(r"(\d+)\s+\[id=\"?([^\]\"]+)\"?\]", line.rstrip()) + if test: + index = test.group(1) + scaff_name = test.group(2) + if scaff_name not in links_numbering: + index2scaff_name[index] = scaff_name + +def makeLinksNumbering(scaffolds_fasta): + """""" + counter = 0 + with open(scaffolds_fasta, 'r') as f: + for line in f: + if line[0] == ">": + seq_id = line.rstrip().split()[0][1:] + counter += 1 + links_numbering[seq_id] = str(counter) + + +def writeTSVFile(infile, outfile): + """Create tigpair_checkpoint file""" + with open(infile, 'r') as fin: + with open(outfile, 'w') as fout: + for line in fin: + test = re.search(r"(\d+)--(\d+)\s+\[label=(\d+), weight=(\d+)", line.rstrip()) + if test: + scaffa = index2scaff_name[test.group(1)] + scaffb = index2scaff_name[test.group(2)] + label = int(test.group(3)) + links = int(test.group(4)) + + if scaffa > scaffb: + scaffa, scaffb = scaffb, scaffa + + out_scaffa = "" + out_scaffb = "" + rout_scaffa = "" + rout_scaffb = "" + out_scaffa = out_scaffb = rout_scaffa = rout_scaffb = "" + + # HH : rA->B or rB->A + if label == 0: + out_scaffa = "r" + links_numbering[scaffa] + out_scaffb = "f" + links_numbering[scaffb] + + rout_scaffb = "r" + links_numbering[scaffb] + rout_scaffa = "f" + links_numbering[scaffa] + # HT : rA->rB or B->A + elif label == 1: + out_scaffa = "r" + links_numbering[scaffa] + out_scaffb = "r" + links_numbering[scaffb] + + rout_scaffb = "f" + links_numbering[scaffb] + rout_scaffa = "f" + links_numbering[scaffa] + # TH : A->B or rB->rA + elif label == 2: + out_scaffa = "f" + links_numbering[scaffa] + out_scaffb = "f" + links_numbering[scaffb] + + rout_scaffb = "r" + links_numbering[scaffb] + rout_scaffa = "r" + links_numbering[scaffa] + # TT : A->rB or B->rA + elif label == 3: + out_scaffa = "f" + links_numbering[scaffa] + out_scaffb = "r" + links_numbering[scaffb] + + rout_scaffb = "f" + links_numbering[scaffb] + rout_scaffa = "r" + links_numbering[scaffa] + + match = re.search(r"d=(\d+)", line.rstrip()) + if match: + dist = int(match.group(1)) + est_dist = True + else: + dist = 100 + est_dist = False + + if dist < 0: + dist_category = -1 + elif dist == 100 and est_dist is False: + dist_category = 10 + elif dist < 500: + dist_category = 500 + elif dist < 1000: + dist_category = 1000 + elif dist < 5000: + dist_category = 5000 + else: + dist_category = 10000 + + + gap = links * dist + + string = str(dist_category) + "\t" + out_scaffa + "\t" + \ + out_scaffb + "\t" + str(links) + "\t" + str(gap) + "\n" + fout.write(string) + + stringr = str(dist_category) + "\t" + rout_scaffb + "\t" + \ + rout_scaffa + "\t" + str(links) + "\t" + str(gap) + "\n" + fout.write(stringr) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Create a XXX.tigpair_checkpoint.tsv file from ARCS graph output ' + 'that LINKS can utilize') + parser.add_argument('graph_file', help='ARCS graph file output (.gv)', type=str) + parser.add_argument('out_file', help='Output file name. Must be named XXX.tigpair_checkpoint.tsv, where XXX ' + 'is same as base name (-b) given to LINKS.', type=str) + parser.add_argument('fasta_file', help='FASTA file of sequences to scaffold', type=str) + args = parser.parse_args() + + readGraphFile(args.graph_file) + makeLinksNumbering(args.fasta_file) + writeTSVFile(args.graph_file, args.out_file) + +