Skip to content

Commit

Permalink
Add chisel_rdr command
Browse files Browse the repository at this point in the history
  • Loading branch information
simozacca committed Jul 28, 2022
1 parent 5a567b8 commit 5bd8fec
Show file tree
Hide file tree
Showing 2 changed files with 324 additions and 1 deletion.
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,6 @@
'chisel_plotting=chisel.bin.chisel_plotting:main',
'chisel_pseudonormal=chisel.bin.chisel_pseudonormal:main',
'chisel_prep=chisel.bin.chisel_prep:main',
'chisel_bedding=chisel.bin.chisel_bedding:main']}
'chisel_bedding=chisel.bin.chisel_bedding:main',
'chisel_rdr=chisel.bin.chisel_rdr:main']}
)
322 changes: 322 additions & 0 deletions src/chisel/bin/chisel_rdr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,322 @@
#!/usr/bin/env python2.7

import os, sys
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
import argparse
import shlex
import shutil
import glob
import re
import traceback
import subprocess as sp
import multiprocessing as mp

import numpy as np

import chisel
src = os.path.dirname(chisel.__file__)
from ..Utils import *
from chisel.RDREstimator import get_bins


def parse_args():
description = "CHISEL command to only compute RDRs per cell."
parser = argparse.ArgumentParser(description=description)
parser.add_argument("-x","--rundir", required=False, default='./', type=str, help="Running directory (default: current directory)")
parser.add_argument("-t","--tumor", required=True, type=str, help="Barcoded single-cell BAM file")
parser.add_argument("-r","--reference", type=str, required=True, help="Reference genome")
parser.add_argument("-b","--size", type=str, required=False, default="250kb", help="Bin size, with or without \"kb\" or \"Mb\" (default: 250kb)")
parser.add_argument("-c", "--chromosomes", type=str, required=False, default=' '.join(['chr{}'.format(i) for i in range(1, 23)]), help="Space-separeted list of chromosomes between apices (default: \"chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22\")")
parser.add_argument("-m","--minreads", type=int, required=False, default=100000, help="Minimum number total reads to select cells (default: 100000)")
parser.add_argument("--bcftools", required=False, default=None, type=str, help="Path to the directory to \"bcftools\" executable, required in default mode (default: bcftools is directly called as it is in user $PATH)")
parser.add_argument("--samtools", required=False, default=None, type=str, help="Path to the directory to \"samtools\" executable, required in default mode (default: samtools is directly called as it is in user $PATH)")
parser.add_argument("--art", required=False, default=None, type=str, help="Path to the directory to \"art_illumina\" executable (default: in $PATH)")
parser.add_argument("--bwa", required=False, default=None, type=str, help="Path to the directory to \"bwa\" executable (default: in $PATH)")
parser.add_argument("--cellprefix", type=str, required=False, default='CB:Z:', help="Prefix of cell barcode field in SAM format (default: CB:Z:)")
parser.add_argument("--cellsuffix", type=str, required=False, default=None, help="Suffix of cell barcode field in SAM format (default: none)")
parser.add_argument("--simcov", required=False, type=float, default=.2, help="Sequencing fold coverage of simulated normal BAM file (default: 0.2)")
parser.add_argument("--binstats", required=False, type=int, default=None, help="Number of bins to sample per chromosome to estimate sequencing stats (default: all are used, fix a number for improving speed)")
parser.add_argument("--keeptmpdir", required=False, default=False, action='store_true', help="Do not erase temporary directory (default: False)")
parser.add_argument("--seed", required=False, type=int, default=None, help="Random seed for replication (default: None)")
parser.add_argument("-j","--jobs", required=False, type=int, default=0, help="Number of parallele jobs to use (default: equal to number of available processors)")
args = parser.parse_args()

if args.seed is not None:
np.random.seed(args.seed)

if not os.path.isdir(args.rundir):
raise ValueError("Running directory does not exists: {}".format(args.rundir))
if not os.path.isfile(args.tumor):
raise ValueError("Barcoded single-cell BAM file does not exist: {}".format(args.tumor))
if args.seed and args.seed < 1:
raise ValueError("The random seed must be positive!")
if args.minreads < 1:
raise ValueError("The minimum number of reads must be positive!")
if args.simcov <= 0.0:
raise ValueError("The sequencing coverage of simulated normal must be >= 0.0!")
if args.binstats is not None and args.binstats <= 0:
raise ValueError("The number of bins for sequencing stats must be >= 0.0!")

if not os.path.isfile(args.reference):
raise ValueError(error("Reference genome file does not exist: {}".format(args.reference)))
refidx = ['{}.{}'.format(args.reference, ix) for ix in ['amb', 'ann', 'bwt', 'pac', 'sa']]
if not all(os.path.isfile(f) for f in refidx):
raise ValueError(error("Some of the BWA index files are missing, please make sure these are available and generated through the command \n\t``bwa index {}''.\n Expected files are: {}".format(args.reference, '\n'.join(refidx))))

size = 0
try:
if args.size[-2:] == "kb":
size = int(args.size[:-2]) * 1000
elif args.size[-2:] == "Mb":
size = int(args.size[:-2]) * 1000000
else:
size = int(args.size)
except:
raise ValueError("Size must be a number, optionally ending with either \"kb\" or \"Mb\"!")

if not args.jobs:
args.jobs = mp.cpu_count()
if args.jobs < 1:
raise ValueError("The number of jobs must be positive!")

bcftools = args.bcftools
if not bcftools:
bcftools = "bcftools"
if which(bcftools) is None:
raise ValueError("bcftools has not been found or is not executable!")

samtools = args.samtools
if not samtools:
samtools = "samtools"
if which(samtools) is None:
raise ValueError("samtools has not been found or is not executable!")

art = args.art
if not art:
art = "art_illumina"
if which(art) is None:
raise ValueError(error("art_illumina has not been found or is not executable!\n\nIf you are within a CHISEL conda environment ${ENV} you can install it with:\n\tconda install -c bioconda -n ${ENV} bwa\n\nOtherwise, please provide with the flag `--art` the full path to the directory containing art_illumina exacutable."))

bwa = args.bwa
if not bwa:
bwa = "bwa"
if which(bwa) is None and not args.barcodeonly:
raise ValueError(error("bwa has not been found or is not executable!\n\nIf you are within a CHISEL conda environment ${ENV} you can install it with:\n\tconda install -c bioconda -n ${ENV} bwa\n\nOtherwise, please provide with the flag `--bwa` the full path to the directory containing bwa exacutable."))


return {
"rundir" : os.path.abspath(args.rundir),
"tumor" : os.path.abspath(args.tumor),
"reference" : os.path.abspath(args.reference),
"binsize" : size,
"chromosomes" : args.chromosomes,
"minreads" : args.minreads,
"bcftools" : bcftools,
"samtools" : samtools,
"bwa" : bwa,
"art" : art,
"cellprefix" : args.cellprefix,
"cellsuffix" : args.cellsuffix,
"simcov" : args.simcov,
"binstats" : args.binstats,
"keeptmpdir" : args.keeptmpdir,
"seed" : args.seed,
"jobs" : args.jobs
}


def main():
log('Parsing and checking arguments', level='PROGRESS')
args = parse_args()
log('\n'.join(['Arguments:'] + ['\t{} : {}'.format(a, args[a]) for a in args]) + '\n', level='INFO')

log('Setting directories', level='PROGRESS')
ddata, dbaf, drdr, dcom, dcal, dclo, dplo = setup(args)
def get_comp(name):
comp = os.path.join(src, name)
if not os.path.isfile(comp):
raise ValueError("{} not found in src directory of bin i.e. {}, is anything been moved?".format(name, src))
return comp

log('Simulating DNA sequencing reads for mappability correction', level='PROGRESS')
args['normal'] = os.path.join(ddata, 'simulatednormal.bam')
simulate_sequencing(args)

log('Computing RDRs', level='PROGRESS')
cmd = 'python2.7 {} -n {} -t {} -r {} -b {} -m {} -j {} -c \"{}\" --outdir {}'
cmd = cmd.format(get_comp('RDREstimator.py'), args['normal'], args['tumor'], args['reference'], args['binsize'], args['minreads'], args['jobs'], args['chromosomes'], drdr)
if args['samtools'] is not None:
cmd += " -s {}".format(args['samtools'])
cmd += " --cellprefix {}".format(args['cellprefix'])
if args['cellsuffix'] is not None:
cmd += " --cellsuffix {}".format(args['cellsuffix'])
runcmd(cmd, drdr, out='rdr.tsv')
lcel = os.path.join(drdr, 'total.tsv')
rdr = os.path.join(drdr, 'rdr.tsv')


def setup(args):
if any(os.path.isdir(os.path.join(args['rundir'], x)) for x in ['data', 'baf', 'rdr', 'combo', 'calls', 'clones', 'plots']):
log('Some of the working folders already exist in the running directory and content will be overwritten, please interrupt the process if this was not intended.', level='WARN')

ddata = os.path.join(args['rundir'], 'data')
if not os.path.isdir(ddata):
os.mkdir(ddata)

dbaf = os.path.join(args['rundir'], 'baf')
if not os.path.isdir(dbaf):
os.mkdir(dbaf)

drdr = os.path.join(args['rundir'], 'rdr')
if not os.path.isdir(drdr):
os.mkdir(drdr)

dcom = os.path.join(args['rundir'], 'combo')
if not os.path.isdir(dcom):
os.mkdir(dcom)

dcal = os.path.join(args['rundir'], 'calls')
if not os.path.isdir(dcal):
os.mkdir(dcal)

dclo = os.path.join(args['rundir'], 'clones')
if not os.path.isdir(dclo):
os.mkdir(dclo)

dplo = os.path.join(args['rundir'], 'plots')
if not os.path.isdir(dplo):
os.mkdir(dplo)

return ddata, dbaf, drdr, dcom, dcal, dclo, dplo


def simulate_sequencing(args):
log('Retrieving sequencing stats', level='INFO')
bins = get_bins(args['reference'], args['chromosomes'].split(), args['binsize'], bams=[args['tumor']], samtools=args['samtools'])
if args['binstats'] is not None:
bins = {c : [bins[c][x] for x in np.random.choice(len(bins[c]), min(args['binstats'], len(bins[c])))] for c in bins}
jobs = [(chro, cbin, args) for chro in bins for cbin in bins[chro]]
bar = ProgressBar(total=len(jobs), length=30, verbose=False)
pool = mp.Pool(processes=min(args['jobs'], len(jobs)))
progress = (lambda e : bar.progress(advance=True, msg="Stats retrieved for {}:{}-{}".format(*e)))
bamstats = [m for e, m in pool.imap_unordered(get_bam_stats, jobs) if progress(e)]
bamstats = list(filter(lambda v : v['read'] > 0.0 or v['fragment'] > 0.0, bamstats))

avg = (lambda L : sum(L) / float(len(L)))
if any(s['fragment'] > 0 for s in bamstats):
bamstats = [s for s in bamstats if s['read'] > 0.0 or s['fragment'] > 0.0]
read = int(round(avg([s['read'] for s in bamstats])))
fragment = int(round(avg([s['fragment'] for s in bamstats])))
sdfragment = int(round(avg([s['sd'] for s in bamstats])))
log('BAM has been identified as paired-end sequencing with read length {} and fragment size {} (sd: {})'.format(read, fragment, sdfragment), level='INFO')
sequencing_paired(args, read, fragment, sdfragment)
else:
bamstats = [s for s in bamstats if s['read'] > 0.0]
read = int(round(avg([s['read'] for s in bamstats])))
log('BAM has been identified as paired-end sequencing with read length {}'.format(read), level='INFO')
sequencing_single(args, read)
assert os.path.isfile(args['normal'])
log('Simulated normal sequencing reads are written at: {}'.format(args['normal']), level='INFO')


def get_bam_stats(dt):
chro, bins, args = dt
cmd_sam = '{} stats {} {}:{}-{}'.format(args['samtools'], args['tumor'], chro, bins[0], bins[1])
cmd_gre = 'grep -e "average length" -e "insert size average" -e "insert size standard deviation"'
sam = sp.Popen(shlex.split(cmd_sam), stdout=sp.PIPE, stderr=sp.PIPE)
gre = sp.Popen(shlex.split(cmd_gre), stdin=sam.stdout, stdout=sp.PIPE, stderr=sp.PIPE)
stdout, stderr = gre.communicate()
if gre.returncode != 0:
raise ValueError(error('Merging failed with messages:\n{}\n{}\n'.format(stdout, stderr)))
result = stdout.strip().split('\n')
if len(result) != 3:
raise ValueError(error('More or less than expected BAM statistics have been found:\n{}\n'.format('\n'.join(result))))
process = (lambda r : 'read' if 'average length' in r else ('fragment' if 'insert size average' in r else 'sd'))
check = (lambda v : float(v) if v.replace('.', '', 1).isdigit() else 0)
return (chro, bins[0], bins[1]), {process(rec) : check(rec.split(':')[-1].strip()) for rec in result}


def sequencing_paired(args, read, fragment, sdfragment):
tmpdir = os.path.join(os.path.dirname(args['normal']), '_TMPDIR')
if not os.path.exists(tmpdir):
os.mkdir(tmpdir)

log('Simulating sequencing reads', level='INFO')
pre_fastq = os.path.join(tmpdir, 'normal')
profile = 'HS20' if abs(100 - read) < abs(125 - read) else 'HS25'
cmd = '{} -ss {} -na -i {} -p -l {} -f {} -m {} -s {} -o {}'.format(args['art'], profile, args['reference'], read, args['simcov'], fragment, sdfragment, pre_fastq)
with open(os.path.join(tmpdir, 'art_illumina.log'), 'w') as simlog:
proc = sp.Popen(shlex.split(cmd), stdout=sp.PIPE, stderr=simlog)
stdout, stderr = proc.communicate()
fastqs = ('{}1.fq'.format(pre_fastq), '{}2.fq'.format(pre_fastq))
if proc.returncode != 0 or not (os.path.exists(fastqs[0]) and os.path.exists(fastqs[1])):
raise ValueError(error('ART Illumina simuation of sequencing reads failed:\n{}\n{}\n'.format(stdout, stderr)))

log('Aligning simulated sequencing reads', level='INFO')
jobs = max(1, int(round(args['jobs'] / 2.0)))
cmd_bwa = '{} mem -M -t {} {} {} {}'.format(args['bwa'], jobs, args['reference'], fastqs[0], fastqs[1])
cmd_sor = '{} sort - -Obam -o {} -T {} -@ {}'.format(args['samtools'], args['normal'], tmpdir, max(1, args['jobs'] - jobs))
bwa = sp.Popen(shlex.split(cmd_bwa), stdout=sp.PIPE, stderr=sp.PIPE)
sor = sp.Popen(shlex.split(cmd_sor), stdin=bwa.stdout, stdout=sp.PIPE, stderr=sp.PIPE)
stdout, stderr = sor.communicate()
if sor.returncode != 0 or not os.path.exists(args['normal']):
raise ValueError(error('Alignment failed with messages:\n{}\n{}\n'.format(stdout, stderr)))

log('Indexing simulated sequencing reads', level='INFO')
cmd_sam = '{} index {}'.format(args['samtools'], args['normal'])
sam = sp.Popen(shlex.split(cmd_sam), stdout=sp.PIPE, stderr=sp.PIPE)
stdout, stderr = sam.communicate()
fastqs = ('{}1.fq'.format(pre_fastq), '{}2.fq'.format(pre_fastq))
if sor.returncode != 0:
raise ValueError(error('ART Illumina simuation of sequencing reads failed:\n{}\n{}\n'.format(stdout, stderr)))

if not args['keeptmpdir']:
shutil.rmtree(tmpdir)
return


def sequencing_single(args, read):
tmpdir = os.path.join(os.path.dirname(args['normal']), '_TMPDIR')
if not os.path.exists(tmpdir):
os.mkdir(tmpdir)

log('Simulating sequencing reads', level='INFO')
pre_fastq = os.path.join(tmpdir, 'normal')
profile = 'HS20' if abs(100 - read) < abs(125 - read) else 'HS25'
cmd = '{} -ss {} -na -i {} -l {} -f {} -o {}'.format(args['art'], profile, args['reference'], read, args['simcov'], pre_fastq)
with open(os.path.join(tmpdir, 'art_illumina.log'), 'w') as simlog:
proc = sp.Popen(shlex.split(cmd), stdout=sp.PIPE, stderr=simlog)
stdout, stderr = proc.communicate()
fastq = '{}.fq'.format(pre_fastq)
if proc.returncode != 0 or not os.path.exists(fastq):
raise ValueError(error('ART Illumina simuation of sequencing reads failed:\n{}\n{}\n'.format(stdout, stderr)))

log('Aligning simulated sequencing reads', level='INFO')
jobs = max(1, int(round(args['jobs'] / 2.0)))
cmd_bwa = '{} mem -M -t {} {} {}'.format(args['bwa'], jobs, args['reference'], fastq)
cmd_sor = '{} sort - -Obam -o {} -T {} -@ {}'.format(args['samtools'], args['normal'], tmpdir, max(1, args['jobs'] - jobs))
bwa = sp.Popen(shlex.split(cmd_bwa), stdout=sp.PIPE, stderr=sp.PIPE)
sor = sp.Popen(shlex.split(cmd_sor), stdin=bwa.stdout, stdout=sp.PIPE, stderr=sp.PIPE)
stdout, stderr = sor.communicate()
if sor.returncode != 0 or not os.path.exists(args['normal']):
raise ValueError(error('Alignment failed with messages:\n{}\n{}\n'.format(stdout, stderr)))

log('Indexing simulated sequencing reads', level='INFO')
cmd_sam = '{} index {}'.format(args['samtools'], args['normal'])
sam = sp.Popen(shlex.split(cmd_sam), stdout=sp.PIPE, stderr=sp.PIPE)
stdout, stderr = sam.communicate()
fastqs = ('{}1.fq'.format(pre_fastq), '{}2.fq'.format(pre_fastq))
if sor.returncode != 0:
raise ValueError(error('ART Illumina simuation of sequencing reads failed:\n{}\n{}\n'.format(stdout, stderr)))

if not args['keeptmpdir']:
shutil.rmtree(tmpdir)
return


if __name__ == '__main__':
main()

0 comments on commit 5bd8fec

Please sign in to comment.