
Draft bin sets with CONCOCT, MaxBin2, and MetaBAT2


Generating contig coverage across samples

metaGEM uses well-established binners to generate three draft bin sets, which are later de-replicated.

First, we need to generate contig coverage information across samples by mapping each set of paired-end reads against each generated assembly.

By default, metaGEM generates the input for the individual binners using the crossMap Snakefile rule as follows:

rule crossMap:  
    input:
        contigs = rules.megahit.output,
        reads = f'{config["path"]["root"]}/{config["folder"]["qfiltered"]}'
    output:
        concoct = directory(f'{config["path"]["root"]}/{config["folder"]["concoct"]}/{{IDs}}/cov'),
        metabat = directory(f'{config["path"]["root"]}/{config["folder"]["metabat"]}/{{IDs}}/cov'),
        maxbin = directory(f'{config["path"]["root"]}/{config["folder"]["maxbin"]}/{{IDs}}/cov')
    benchmark:
        f'{config["path"]["root"]}/benchmarks/{{IDs}}.crossMap.benchmark.txt'
    message:
        """
        Use this approach to provide all 3 binning tools with cross-sample coverage information.
        This will likely give superior binning results, but may not be feasible for datasets with
        many large samples, such as the Tara Oceans dataset.
        """
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u;
        cd $SCRATCHDIR
        cp {input.contigs} .
        mkdir -p {output.concoct}
        mkdir -p {output.metabat}
        mkdir -p {output.maxbin}
        # Define the focal sample ID, fsample:
        # the sample whose assembly all other samples' reads will be mapped against in the for loop below
        fsampleID=$(echo $(basename $(dirname {input.contigs})))
        echo -e "\nFocal sample: $fsampleID ... "
        echo "Renaming and unzipping assembly ... "
        mv $(basename {input.contigs}) $(echo $fsampleID|sed 's/$/.fa.gz/g')
        gunzip $(echo $fsampleID|sed 's/$/.fa.gz/g')
        echo -e "\nIndexing assembly ... "
        bwa index $fsampleID.fa
        
        for folder in {input.reads}/*;do 
                id=$(basename $folder)
                echo -e "\nCopying sample $id to be mapped against the focal sample $fsampleID ..."
                cp $folder/*.gz .
                
                # Maybe I should be piping the lines below to reduce I/O ?
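                # e.g. a single pipe (untested sketch) would avoid writing the intermediate SAM/BAM to disk:
                #   bwa mem -t {config[cores][crossMap]} $fsampleID.fa *.fastq.gz | \
                #       samtools sort -@ {config[cores][crossMap]} -o $id.sort -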
                echo -e "\nMapping sample to assembly ... "
                bwa mem -t {config[cores][crossMap]} $fsampleID.fa *.fastq.gz > $id.sam
                
                echo -e "\nConverting SAM to BAM with samtools view ... " 
                samtools view -@ {config[cores][crossMap]} -Sb $id.sam > $id.bam
                echo -e "\nSorting BAM file with samtools sort ... " 
                samtools sort -@ {config[cores][crossMap]} -o $id.sort $id.bam
                echo -e "\nRunning jgi_summarize_bam_contig_depths script to generate contig abundance/depth file for maxbin2 input ... "
                jgi_summarize_bam_contig_depths --outputDepth $id.depth $id.sort
                echo -e "\nMoving depth file to sample $fsampleID maxbin2 folder ... "
                mv $id.depth {output.maxbin}
                echo -e "\nIndexing sorted BAM file with samtools index for CONCOCT input table generation ... " 
                samtools index $id.sort
                echo -e "\nRemoving temporary files ... "
                rm *.fastq.gz *.sam *.bam
        done
        
        nSamples=$(ls {input.reads}|wc -l)
        echo -e "\nDone mapping focal sample $fsampleID agains $nSamples samples in dataset folder."
        echo -e "\nRunning jgi_summarize_bam_contig_depths for all sorted bam files to generate metabat2 input ... "
        jgi_summarize_bam_contig_depths --outputDepth $id.all.depth *.sort
        echo -e "\nMoving input file $id.all.depth to $fsampleID metabat2 folder... "
        mv $id.all.depth {output.metabat}
        echo -e "Done. \nCutting up contigs to 10kbp chunks (default), not to be used for mapping!"
        cut_up_fasta.py -c {config[params][cutfasta]} -o 0 -m $fsampleID.fa -b assembly_c10k.bed > assembly_c10k.fa
        echo -e "\nSummarizing sorted and indexed BAM files with concoct_coverage_table.py to generate CONCOCT input table ... " 
        concoct_coverage_table.py assembly_c10k.bed *.sort > coverage_table.tsv
        echo -e "\nMoving CONCOCT input table to $fsampleID concoct folder"
        mv coverage_table.tsv {output.concoct}
        """

This works well for small-to-medium-sized datasets of up to roughly 150 samples with about 3 Gbp each, such as the gut microbiome dataset used in the metaGEM publication.

Next, we can run each individual binner using the contig coverage information across all samples in the dataset.
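
To trigger a single binner for a single sample without the metaGEM wrapper, one option is to ask Snakemake for the rule's output directory directly (a sketch; the sample ID "S1" is hypothetical and the target path depends on the root path and folder names in your config.yaml):

snakemake --cores 24 /path/to/root/concoct/S1/S1.concoct-bins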

CONCOCT

CONCOCT is implemented as follows:

rule concoct:
    input:
        table = f'{config["path"]["root"]}/{config["folder"]["concoct"]}/{{IDs}}/cov/coverage_table.tsv',
        contigs = rules.megahit.output
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["concoct"]}/{{IDs}}/{{IDs}}.concoct-bins')
    benchmark:
        f'{config["path"]["root"]}/benchmarks/{{IDs}}.concoct.benchmark.txt'
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u;
        mkdir -p $(dirname $(dirname {output}))
        cd $SCRATCHDIR
        cp {input.contigs} {input.table} $SCRATCHDIR
        echo "Unzipping assembly ... "
        gunzip $(basename {input.contigs})
        echo -e "Done. \nCutting up contigs (default 10kbp chunks) ... "
        cut_up_fasta.py -c {config[params][cutfasta]} -o 0 -m $(echo $(basename {input.contigs})|sed 's/.gz//') > assembly_c10k.fa
        
        echo -e "\nRunning CONCOCT ... "
        concoct --coverage_file $(basename {input.table}) \
            --composition_file assembly_c10k.fa \
            -b $(basename $(dirname {output})) \
            -t {config[cores][concoct]} \
            -c {config[params][concoct]}
            
        echo -e "\nMerging clustering results into original contigs ... "
        merge_cutup_clustering.py $(basename $(dirname {output}))_clustering_gt1000.csv > $(basename $(dirname {output}))_clustering_merged.csv
        
        echo -e "\nExtracting bins ... "
        mkdir -p $(basename {output})
        extract_fasta_bins.py $(echo $(basename {input.contigs})|sed 's/.gz//') $(basename $(dirname {output}))_clustering_merged.csv --output_path $(basename {output})
        
        mkdir -p $(dirname {output})
        mv $(basename {output}) *.txt *.csv $(dirname {output})
        """

MaxBin2

MaxBin2 is implemented as follows:

rule maxbinCross:
    input:
        assembly = rules.megahit.output,
        depth = f'{config["path"]["root"]}/{config["folder"]["maxbin"]}/{{IDs}}/cov'
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["maxbin"]}/{{IDs}}/{{IDs}}.maxbin-bins')
    benchmark:
        f'{config["path"]["root"]}/benchmarks/{{IDs}}.maxbin.benchmark.txt'
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u;
        cp -r {input.assembly} {input.depth}/*.depth $SCRATCHDIR
        mkdir -p $(dirname {output})
        cd $SCRATCHDIR
        echo -e "\nUnzipping assembly ... "
        gunzip $(basename {input.assembly})
        echo -e "\nGenerating list of depth files based on crossMap rule output ... "
        find . -name "*.depth" > abund.list
        
        echo -e "\nRunning maxbin2 ... "
        run_MaxBin.pl -contig contigs.fasta -out $(basename $(dirname {output})) -abund_list abund.list
        rm *.depth abund.list contigs.fasta
        mkdir -p $(basename {output})
        mv *.fasta $(basename {output})
        mv * $(dirname {output})
        """

MetaBAT2

MetaBAT2 is implemented as follows:

rule metabatCross:
    input:
        assembly = rules.megahit.output,
        depth = f'{config["path"]["root"]}/{config["folder"]["metabat"]}/{{IDs}}/cov'
    output:
        directory(f'{config["path"]["root"]}/{config["folder"]["metabat"]}/{{IDs}}/{{IDs}}.metabat-bins')
    benchmark:
        f'{config["path"]["root"]}/benchmarks/{{IDs}}.metabat.benchmark.txt'
    shell:
        """
        set +u;source activate {config[envs][metabagpipes]};set -u;
        cp {input.assembly} {input.depth}/*.all.depth $SCRATCHDIR
        mkdir -p $(dirname {output})
        cd $SCRATCHDIR
        gunzip $(basename {input.assembly})
        echo -e "\nRunning metabat2 ... "
        metabat2 -i contigs.fasta -a *.all.depth -s {config[params][metabatMin]} -v --seed {config[params][seed]} -t 0 -m {config[params][minBin]} -o $(basename $(dirname {output}))
        mkdir -p {output}
        mv *.fa {output}
        """