diff --git a/.gitignore b/.gitignore
index 51cbe85..e66e0f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,15 @@
 __pycache__/
 *.py[cod]
 
+# emacs tmp files
+*~
+
 # C extensions
 *.so
 
+# snakemake files
+.snakemake
+
 # Distribution / packaging
 .Python
 env/
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..a8f46fe
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,31 @@
+Version 0.2 - September 3, 2016
+-----------
+
+Version 0.2 of WASP is a major update to the code,
+especially the mapping code. It fixes several bugs related
+to how paired-end reads are handled. For this reason it is
+strongly recommended that users switch to this version
+of the pipeline.
+
+Changes include:
+* re-wrote mapping scripts to make them simpler and more modular
+* re-wrote mapping test scripts and added many tests
+* fixed several mapping pipeline bugs related to paired-end reads
+* find_intersecting_snps.py window size no longer required (is now
+  unlimited)
+* find_intersecting_snps.py can now take HDF5 files as input
+* find_intersecting_snps.py can now consider only haplotypes
+  present in samples, rather than all possible allelic combinations
+  of SNPs overlapping reads.
+* added get_as_counts.py script that outputs allele-specific read
+  counts at all polymorphic SNPs.
+* snp2h5 now records sample info in output HDF5 files
+* improved speed of many CHT pipeline steps
+* improved stability of CHT dispersion parameter estimation
+* added Snakemake workflows for both mapping and CHT pipelines
+* added qqplot.R script to CHT workflow
+
+
+Version 0.1
+-----------
+Initial version of WASP
diff --git a/CHT/.gitignore b/CHT/.gitignore
index ded6067..400b95a 100644
--- a/CHT/.gitignore
+++ b/CHT/.gitignore
@@ -1,5 +1,11 @@
 *.py[cod]
 
+# snakemake files
+.snakemake
+
+# emacs backups
+*~
+
 # C extensions
 *.so
 
diff --git a/CHT/README.md b/CHT/README.md
index b154708..0dac2f5 100644
--- a/CHT/README.md
+++ b/CHT/README.md
@@ -93,11 +93,15 @@ the [1000 Genomes website](http://www.1000genomes.org/data#DataAccess).
 
 ## Workflow
 
-An example workflow is provided in [example_workflow.sh](../example_workflow.sh)
-script. This workflow uses data in the [example_data directory](../example_data).
+We now provide a Snakemake workflow that can be used to run the entire
+CHT pipeline. For more information see the [Snakemake README](README.snakemake.md).
+
+An example workflow in the form of a shell script is also provided in the
+[example_workflow.sh](../example_workflow.sh) script. This workflow uses
+data in the [example_data directory](../example_data).
 
 Some of the input files that we used for our paper can be downloaded from
-[here](http://eqtl.uchicago.edu/histone_mods/haplotype_read_counts/).
+[here](http://eqtl.uchicago.edu/histone_mods/).
 
 The following steps can be used to generate input files and run the
 Combined Haplotype Test. The examples given below use the example
@@ -166,9 +170,16 @@ For example, if the goal is to identify histone-mark QTLs, the target
 regions should be ChIP-seq peaks, and the test SNPs should be SNPs
 that are near-to or within the ChIP-seq peaks.
 
+*Note (added 4/25/2016):* the target regions for a single test region should be
+non-overlapping. Overlapping target regions can cause some reads
+to be counted multiple times in a single test and inflate the test
+statistic. We plan to add a check for this to the
+extract_haplotype_read_counts.py script.
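+
+As an interim workaround, a minimal sketch of such a check is shown
+below (this is illustrative only and not part of WASP; it assumes each
+target region is given as a (start, end) pair):
+
+    def has_overlap(regions):
+        """Return True if any of the target regions for a test SNP overlap."""
+        regions = sorted(regions)
+        for (s1, e1), (s2, e2) in zip(regions, regions[1:]):
+            # after sorting, an overlap exists if the next region starts
+            # before the previous region ends
+            if s2 <= e1:
+                return True
+        return False
+
+    # e.g. reads falling in 1500-2000 lie in both regions and would be
+    # double-counted: has_overlap([(1000, 2000), (1500, 2500)]) -> True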
+
 If the goal is to identify eQTLs, the target regions should be the
 exons of genes, and the test SNPs could be SNPs within a specified
-distance of the TSS.
+distance of the TSS. If a gene contains overlapping or duplicate
+exons, these should be collapsed.
 
 We provide a script, `get_target_regions.py`, that can generate a
 list of target regions and test SNPs for ChIP-seq peaks that match
@@ -371,5 +382,5 @@ The following example shows how the first 2 PCs can be used as covariates (repla
 
 ## Contact
 
 For questions about the combined haplotype test, please contact Graham McVicker
-(gpm@stanford.edu) or Bryce van de Geijn (bmvdgeijn@uchicago.edu).
+(gmcvicker@salk.edu) or Bryce van de Geijn (vandegeijn@hsph.harvard.edu).
diff --git a/CHT/README.snakemake.md b/CHT/README.snakemake.md
new file mode 100644
index 0000000..050a959
--- /dev/null
+++ b/CHT/README.snakemake.md
@@ -0,0 +1,91 @@
+## Snakemake CHT pipeline
+
+[Snakemake](https://bitbucket.org/snakemake/snakemake/wiki/Home) is a
+workflow management system, designed to streamline the execution of
+software pipelines. We now provide a Snakemake rule file that can be
+used to run the entire Combined Haplotype Test (CHT) pipeline.
+
+For a more complete description of Snakemake see the
+[Snakemake tutorial](http://snakemake.bitbucket.org/snakemake-tutorial.html).
+
+## Installing Snakemake
+
+Snakemake requires python3; however, the CHT pipeline requires
+python2. For this reason, if you are using
+[Anaconda](https://www.continuum.io/downloads), it is recommended that
+you create a [python3
+environment](http://conda.pydata.org/docs/py2or3.html#create-a-python-3-5-environment). For example, you can create a python3.5 Anaconda environment with the following shell command (this only needs to be done once):
+
+    conda create -n py35 python=3.5 anaconda
+
+You can then activate the py35 environment and install the latest version of
+Snakemake with the following commands:
+
+    source activate py35
+    conda install snakemake
+
+Then, when you want to switch back to your default (e.g. python2) environment,
+do the following:
+
+    source deactivate
+
+
+## Configuring the CHT pipeline
+
+The rules for the Snakemake tasks are defined in the [Snakefile](Snakefile).
+
+Configuration parameters for this Snakefile are read from the YAML file
+[snake_conf.yaml](snake_conf.yaml).
+
+Before running Snakemake, edit this file to specify the location
+of all of the input directories and files that will be used by the pipeline.
+This includes the locations of the impute2 SNP files, input BAM files, etc.
+
+Importantly, you must set `wasp_dir` to point to the location of WASP
+on your system, and set `py2` and `Rscript` to set up the environment
+for python and R (e.g. by modifying your PATH) and call the
+appropriate interpreter. This is necessary because Snakemake is run
+using python3, but most of the scripts require python2.
+
+
+## Running the CHT pipeline
+
+Snakemake can be run as a single process or on a compute cluster with
+multiple jobs running simultaneously. To run Snakemake on a single node
+you could do something like the following:
+
+    source activate py35
+    cd $WASP_DIR/CHT
+    snakemake
+
+We provide a script [run_snakemake.sh](run_snakemake.sh) to run Snakemake
+on an SGE compute cluster. You must be in a python3 environment to run this
+script, and the script must be run from a job submission host.
+ + source activate py35 + cd $WASP_DIR/CHT + ./run_snakemake.sh + +It should be possible to make simple modifications to this script to +run on queue management systems other than SGE (e.g. LSF or Slurm). + + +You should Snakemake from within a [Screen](https://www.gnu.org/software/screen/) virtual terminal or using [nohup](https://en.wikipedia.org/wiki/Nohup) so +that if you are disconnected from the cluster, Snakemake will continue to run. + +At the conclusion of the pipeline, a QQPlot will be generated that summarizes +the results of the CHT. + + +## Debugging the CHT pipeline + +By default Snakemake will write an output and error file for each job +to your home directory. These files will be named like `snakejob...sh.{e|o}`. For example: + + # contains error output for extract_haplotype_read_counts rule: + snakejob.extract_haplotype_read_counts.13.sh.e4507125 + +If a rule fails, you should check the appropriate output file to see what +error occurred. A major benefit of Snakemake is that if you re-run snakemake +after a job fails it will pickup where it left off. + diff --git a/CHT/Snakefile b/CHT/Snakefile new file mode 100644 index 0000000..5946974 --- /dev/null +++ b/CHT/Snakefile @@ -0,0 +1,375 @@ +configfile: "snake_conf.yaml" + +import glob + + +def get_individuals(): + """read list of sample identifiers""" + if config['samples_file'].endswith(".gz"): + f = gzip.open(config['samples_file']) + else: + f = open(config['samples_file']) + samples = [] + for line in f: + samples.append(line.strip()) + return samples + + + +rule all: + input: + config['base_dir'] + "/CHT/qqplot.png" + + +#### Make a QQPlot from CHT results + +rule qqplot: + input: + [config['base_dir'] + "/CHT/cht_results.txt", + config['base_dir'] + "/CHT/cht_results_as.txt", + config['base_dir'] + "/CHT/cht_results_bnb.txt", + config['base_dir'] + "/CHT/cht_results_as_permuted.txt", + config['base_dir'] + "/CHT/cht_results_bnb_permuted.txt", + config['base_dir'] + "/CHT/cht_results_permuted.txt"] + output: + config['base_dir'] + "/CHT/qqplot.png" + shell: + "{config[Rscript]} --vanilla " + "{config[wasp_dir]}/CHT/qqplot.R {output[0]} {input}; " + "sleep 10; " + + +########## Generating HDF5 files for SNPs, genome sequence and read counts + +rule snp2h5: + """Convert impute data files to HDF5 format""" + input: + chrom=config['chrom_info'], + snps=glob.glob(config['snp_dir'] + "/chr*.impute2*gz") + output: + geno_prob=config['base_dir'] + "/snp_h5/geno_probs.h5", + snp_index=config['base_dir'] + "/snp_h5/snp_index.h5", + snp_tab=config['base_dir'] + "/snp_h5/snp_tab.h5", + haplotype=config['base_dir'] + "/snp_h5/haplotype.h5" + shell: + "mkdir -p {config[base_dir]}/snp_h5; " + "{config[wasp_dir]}/snp2h5/snp2h5 --chrom {input.chrom} " + " --format impute " + " --geno_prob {output.geno_prob} " + " --snp_index {output.snp_index} " + " --snp_tab {output.snp_tab} " + " --haplotype {output.haplotype} " + " {input.snps}" + + +rule fasta2h5: + """Create HDF5 file from genome FASTA files""" + input: + fasta=glob.glob(config['fasta_dir'] + "/chr*.fa*"), + chrom=config['chrom_info'] + output: + config['base_dir'] + "/fasta_h5/seq.h5" + shell: + "mkdir -p {config[base_dir]}/fasta_h5; " + "{config[wasp_dir]}/snp2h5/fasta2h5 --chrom {input.chrom} " + " --seq {output} {input.fasta}" + + +rule bam2h5: + """Create HDF5 files of read counts from input BAM files containing + aligned and filtered reads.""" + input: + geno_prob=config['base_dir'] + "/snp_h5/geno_probs.h5", + snp_index=config['base_dir'] + "/snp_h5/snp_index.h5", + 
snp_tab=config['base_dir'] + "/snp_h5/snp_tab.h5", + haplotype=config['base_dir'] + "/snp_h5/haplotype.h5", + snp_samples=config['snp_samples'], + chrom=config['chrom_info'], + bam=lambda wildcards: glob.glob("%s/%s.*%s" % (config['bam_dir'], + wildcards.individual, + config['bam_postfix'])) + output: + ref_as=config['base_dir'] + "/bam_h5/ref_as_counts.{individual}.h5", + alt_as=config['base_dir'] + "/bam_h5/alt_as_counts.{individual}.h5", + other_as=config['base_dir'] + "/bam_h5/other_as_counts.{individual}.h5", + read_counts=config['base_dir'] + "/bam_h5/read_counts.{individual}.h5" + shell: + "mkdir -p {config[base_dir]}/bam_h5; " + "{config[py2]} {config[wasp_dir]}/CHT/bam2h5.py --chrom {input.chrom} " + " --snp_index {input.snp_index} " + " --snp_tab {input.snp_tab} " + " --haplotype {input.haplotype} " + " --samples {input.snp_samples} " + " --individual {wildcards.individual} " + " --ref_as_counts {output.ref_as} " + " --alt_as_counts {output.alt_as} " + " --other_as_counts {output.other_as} " + " --read_counts {output.read_counts} " + " {input.bam}" + + + +########### make target regions, extract read counts from them + +rule get_target_regions: + """Extract 'peak' regions that have sufficient read depth + and count of allele specific reads. If different target + regions are desired (e.g. exons), a different script will + need to be used.""" + input: + samples=config['samples_file'], + snp_samples=config['snp_samples'], + snp_index=config['base_dir'] + "/snp_h5/snp_index.h5", + snp_tab=config['base_dir'] + "/snp_h5/snp_tab.h5", + haplotype=config['base_dir'] + "/snp_h5/haplotype.h5", + chrom=config['chrom_info'], + ref_as_counts=[config['base_dir'] + "/bam_h5/ref_as_counts.%s.h5" % x + for x in get_individuals()] + output: + config['base_dir'] + "/target_regions.txt.gz" + shell: + "{config[py2]} {config[wasp_dir]}/CHT/get_target_regions.py " + " --target_region_size 2000 " + " --min_as_count {config[min_as_count]} " + " --min_het_count 1 " + " --min_minor_allele_count 1 " + " --chrom {input.chrom} " + " --read_count_dir {config[base_dir]}/bam_h5 " + " --individuals {input.samples} " + " --samples {input.snp_samples} " + " --snp_tab {input.snp_tab} " + " --snp_index {input.snp_index} " + " --haplotype {input.haplotype} " + " --output_file {output}" + + +rule extract_haplotype_read_counts: + """Extract haplotype read counts for target regions for each individual.""" + input: + snp_samples=config['snp_samples'], + snp_index=config['base_dir'] + "/snp_h5/snp_index.h5", + snp_tab=config['base_dir'] + "/snp_h5/snp_tab.h5", + geno_prob=config['base_dir'] + "/snp_h5/geno_probs.h5", + haplotype=config['base_dir'] + "/snp_h5/haplotype.h5", + samples=config['samples_file'], + chrom=config['chrom_info'], + ref_as_counts=config['base_dir'] + "/bam_h5/ref_as_counts.{individual}.h5", + alt_as_counts=config['base_dir'] + "/bam_h5/alt_as_counts.{individual}.h5", + other_as_counts=config['base_dir'] + "/bam_h5/other_as_counts.{individual}.h5", + read_counts=config['base_dir'] + "/bam_h5/read_counts.{individual}.h5", + target_regions=config['base_dir'] + "/target_regions.txt.gz" + output: + config['base_dir'] + "/hap_read_counts/haplotype_read_counts.{individual}.txt.gz" + shell: + "mkdir -p {config[base_dir]}/hap_read_counts; " + "{config[py2]} {config[wasp_dir]}/CHT/extract_haplotype_read_counts.py " + " --chrom {input.chrom} " + " --snp_index {input.snp_index} " + " --snp_tab {input.snp_tab} " + " --geno_prob {input.geno_prob} " + " --haplotype {input.haplotype} " + " --samples 
{input.snp_samples} " + " --individual {wildcards.individual} " + " --ref_as_counts {input.ref_as_counts} " + " --alt_as_counts {input.alt_as_counts} " + " --other_as_counts {input.other_as_counts} " + " --read_counts {input.read_counts} " + " {input.target_regions} | gzip > {output}" + + + + +########### adjust heterozygote probabilities and read counts + + +rule make_adj_in_out_files: + """makes input / output files containing lists of + adjusted read count files""" + input: + ["%s/hap_read_counts/haplotype_read_counts.%s.txt.gz" % + (config['base_dir'], x) for x in get_individuals()] + output: + "%s/adjust_hap_read_counts/input.txt" % config['base_dir'], + "%s/adjust_hap_read_counts/output.txt" % config['base_dir'] + shell: + "mkdir -p {config[base_dir]}/adjust_hap_read_counts; " + "ls {input} > {output[0]}; " + "ls {input} | sed 's/hap_read_counts/adjust_hap_read_counts/' | " + " sed 's/.txt/.adjusted.txt/' > {output[1]}; " + "sleep 10; " + + +rule adjust_read_counts: + input: + in_file="%s/adjust_hap_read_counts/input.txt" % config['base_dir'], + out_file="%s/adjust_hap_read_counts/output.txt" % config['base_dir'], + seq=config['base_dir'] + "/fasta_h5/seq.h5" + output: + ["%s/adjust_hap_read_counts/haplotype_read_counts.%s.adjusted.txt.gz" % + (config['base_dir'], ind) for ind in get_individuals()] + shell: + "{config[py2]} {config[wasp_dir]}/CHT/update_total_depth.py " + " --seq {input.seq} " + " {input.in_file} {input.out_file}" + + + +rule update_het_probs: + """adjust heterozygote probabilities in haplotype read count + files""" + input: + hap_read_counts="%s/adjust_hap_read_counts/" + "haplotype_read_counts.{individual}.adjusted.txt.gz" % config['base_dir'], + ref_as_counts="%s/bam_h5/ref_as_counts.{individual}.h5" % config['base_dir'], + alt_as_counts="%s/bam_h5/alt_as_counts.{individual}.h5" % config['base_dir'] + output: + "%s/update_het_probs/haplotype_read_counts.{individual}.adjusted.hetp.txt.gz" % \ + config['base_dir'] + shell: + "mkdir -p {config[base_dir]}/update_het_probs; " + "{config[py2]} {config[wasp_dir]}/CHT/update_het_probs.py " + " --ref_as_counts {input.ref_as_counts} " + " --alt_as_counts {input.alt_as_counts} " + " {input.hap_read_counts} {output}" + + + +rule make_cht_input_files: + """make the input file containing list of adjusted read count files + that is used for running the combined test""" + input: + expand(config['base_dir'] + "/update_het_probs/haplotype_read_counts." 
+ "{individual}.adjusted.hetp.txt.gz", individual=get_individuals()) + output: + "%s/CHT/cht_input_files.txt" % config['base_dir'] + shell: + "mkdir -p {config[base_dir]}/CHT; " + "ls {input} > {output}; " + "sleep 10;" + + + + +########### Fitting dispersion coefficients for combined test + + + + +rule fit_bnb_coef: + """estimate dispersion parameters for beta-negative binomial + part of combined test""" + input: + cht_input = config['base_dir'] + "/CHT/cht_input_files.txt" + output: + config['base_dir'] + "/CHT/bnb_coef.txt" + shell: + "{config[py2]} {config[wasp_dir]}/CHT/fit_bnb_coefficients.py " + " --min_as_counts {config[min_as_count]}" + " --sample 2000 --seed 1234 {input.cht_input} {output}" + + +rule fit_as_coef: + """estimate dispersion parameters for allele-specific + part of combined test""" + input: + cht_input = config['base_dir'] + "/CHT/cht_input_files.txt" + output: + config['base_dir'] + "/CHT/as_coef.txt" + shell: + "{config[py2]} {config[wasp_dir]}/CHT/fit_as_coefficients.py " + " {input.cht_input} {output}" + + + +########## Running the combined test on real and permuted data + + +rule combined_test: + input: + as_coef = config['base_dir'] + "/CHT/as_coef.txt", + bnb_coef = config['base_dir'] + "/CHT/bnb_coef.txt", + cht_input = config['base_dir'] + "/CHT/cht_input_files.txt" + output: + results = config['base_dir'] + "/CHT/cht_results.txt" + shell: + "{config[py2]} {config[wasp_dir]}/CHT/combined_test.py " + " --min_as_counts {config[min_as_count]}" + " --bnb_disp {input.bnb_coef} --as_disp {input.as_coef}" + " {input.cht_input} {output.results}" + + +rule as_test: + """run just the allele-specific part of the combined test""" + input: + as_coef = config['base_dir'] + "/CHT/as_coef.txt", + cht_input = config['base_dir'] + "/CHT/cht_input_files.txt" + output: + results = config['base_dir'] + "/CHT/cht_results_as.txt" + shell: + "{config[py2]} {config[wasp_dir]}/CHT/combined_test.py" + " --min_as_counts {config[min_as_count]}" + " --as_only --as_disp {input.as_coef}" + " {input.cht_input} {output.results}" + + + +rule bnb_test: + """run just the beta-negative-binomial part of the combined test""" + input: + bnb_coef = config['base_dir'] + "/CHT/bnb_coef.txt", + cht_input = config['base_dir'] + "/CHT/cht_input_files.txt" + output: + results = config['base_dir'] + "/CHT/cht_results_bnb.txt" + shell: + "{config[py2]} {config[wasp_dir]}/CHT/combined_test.py " + " --min_as_counts {config[min_as_count]}" + " --bnb_only --bnb_disp {input.bnb_coef}" + " {input.cht_input} {output.results}" + + +rule as_test_permuted: + """run just the allele-specific part of the combined test + on permuted genotypes""" + input: + as_coef = config['base_dir'] + "/CHT/as_coef.txt", + cht_input = config['base_dir'] + "/CHT/cht_input_files.txt" + output: + results = config['base_dir'] + "/CHT/cht_results_as_permuted.txt" + shell: + "{config[py2]} {config[wasp_dir]}/CHT/combined_test.py --shuffle " + " --min_as_counts {config[min_as_count]}" + " --as_only --as_disp {input.as_coef}" + " {input.cht_input} {output.results}" + + +rule bnb_test_permuted: + """run just the beta-negative-binomial part of the combined test with + permuted genotypes + """ + input: + bnb_coef = config['base_dir'] + "/CHT/bnb_coef.txt", + cht_input = config['base_dir'] + "/CHT/cht_input_files.txt" + output: + results = config['base_dir'] + "/CHT/cht_results_bnb_permuted.txt" + shell: + "{config[py2]} {config[wasp_dir]}/CHT/combined_test.py --shuffle" + " --min_as_counts {config[min_as_count]}" + " --bnb_only --bnb_disp 
{input.bnb_coef}" + " {input.cht_input} {output.results}" + + + +rule combined_test_permuted: + """Run the combined test on permuted genotypes""" + input: + as_coef = config['base_dir'] + "/CHT/as_coef.txt", + bnb_coef = config['base_dir'] + "/CHT/bnb_coef.txt", + cht_input = config['base_dir'] + "/CHT/cht_input_files.txt" + output: + results = config['base_dir'] + "/CHT/cht_results_permuted.txt" + shell: + "{config[py2]} {config[wasp_dir]}/CHT/combined_test.py --shuffle" + " --min_as_counts {config[min_as_count]}" + " --bnb_disp {input.bnb_coef} --as_disp {input.as_coef}" + " {input.cht_input} {output.results}" + diff --git a/CHT/bam2h5.py b/CHT/bam2h5.py index 5cf7c14..c9fa42e 100755 --- a/CHT/bam2h5.py +++ b/CHT/bam2h5.py @@ -22,11 +22,11 @@ SNP HDF5 data files. The read counts are stored in specified HDF5 output files. -Additionally counts of all reads are stored in another track (at the +Additionally counts of all reads are stored in another track (at the left-most position of the reads). This program does not perform filtering of reads based on mappability. -It is assumed that the inpute BAM files are filtered appropriately prior to +It is assumed that the inpute BAM files are filtered appropriately prior to calling this script. Reads that overlap known indels are not included in allele-specific @@ -37,7 +37,7 @@ BAM Files: Aligned reads are read from one or more BAM files. The provided - BAM files must be sorted and indexed. + BAM files must be sorted and indexed. Input Options: --chrom CHROM_TXT_FILE [required] @@ -62,8 +62,8 @@ --haplotype HAPLOTYPE_H5_FILE [optional] Path to HDF5 file to read phased haplotypes from. If supplied, when read overlaps multiple SNPs counts are randomly - assigned to ONE of the overlapping HETEROZYGOUS SNPs; if not supplied - counts are randomly assigned to ONE of overlapping SNPs (regardless of + assigned to ONE of the overlapping HETEROZYGOUS SNPs; if not supplied + counts are randomly assigned to ONE of overlapping SNPs (regardless of their genotype). --samples SAMPLES_TXT_FILE [optional] @@ -82,7 +82,7 @@ --data_type uint8|uint16 Data type of stored counts; uint8 takes up less disk space but has a maximum value of 255 (default=uint8). - + --ref_as_counts REF_AS_COUNT_H5_FILE [required] Path to HDF5 file to write counts of reads that match reference allele. Allele-specific counts are stored at the position of the SNP. @@ -127,7 +127,7 @@ BAM_CEQUAL = 7 # = BAM_CDIFF = 8 # X -BAM_CIGAR_DICT = {0 : "M", +BAM_CIGAR_DICT = {0 : "M", 1 : "I", 2 : "D", 3 : "N", @@ -151,9 +151,9 @@ def create_carray(h5f, chrom, data_type): atom = tables.UInt16Atom(dflt=0) else: raise NotImplementedError("unsupported datatype %s" % data_type) - + zlib_filter = tables.Filters(complevel=1, complib="zlib") - + # create CArray for this chromosome shape = [chrom.length] carray = h5f.createCArray(h5f.root, chrom.name, @@ -175,7 +175,7 @@ def is_indel(snp): def dump_read(f, read): - cigar_str = " ".join(["%s:%d" % (BAM_CIGAR_DICT[c[0]], c[1]) + cigar_str = " ".join(["%s:%d" % (BAM_CIGAR_DICT[c[0]], c[1]) for c in read.cigar]) f.write("pos: %d\n" @@ -190,7 +190,7 @@ def dump_read(f, read): "seq: %s\n" % (read.pos, read.aend, read.alen, read.qstart, read.qend, read.qlen, read.rlen, read.tlen, cigar_str, read.seq)) - + @@ -231,8 +231,8 @@ def choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx): Returns a tuple containing 4 elements: [0] the index of the SNP in the SNP table, [1] the offset into the read sequence, [2] flag indicating whether the read was 'split' (i.e. 
was a spliced - read), [3] flag indicating whether read overlaps known indel. - If there are no overlapping SNPs or the read cannot be processed, + read), [3] flag indicating whether read overlaps known indel. + If there are no overlapping SNPs or the read cannot be processed, (None, None, is_split, overlap_indel) is returned instead. """ read_offsets = [] @@ -244,7 +244,7 @@ def choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx): n_match_segments = 0 is_split = False overlap_indel = False - + for cig in read.cigar: op = cig[0] op_len = cig[1] @@ -253,7 +253,7 @@ def choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx): # this is a block of match/mismatch in read alignment read_end = read_start_idx + op_len genome_end = genome_start_idx + op_len - + # get offsets of any SNPs that this read overlaps idx = snp_index_array[genome_start_idx:genome_end] is_def = np.where(idx != SNP_UNDEF)[0] @@ -262,14 +262,14 @@ def choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx): read_start_idx = read_end genome_start_idx = genome_end - + n_match_segments += 1 elif op == BAM_CREF_SKIP: # spliced read, skip over this region of genome genome_start_idx += op_len is_split = True elif op == BAM_CSOFT_CLIP: - # end of read is soft-clipped, which means it is + # end of read is soft-clipped, which means it is # present in read, but not used in alignment read_start_idx += op_len elif op == BAM_CHARD_CLIP: @@ -278,15 +278,15 @@ def choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx): pass else: sys.stderr.write("skipping because contains CIGAR code %s " - " which is not currently implemented" % + " which is not currently implemented" % BAM_CIGAR_DICT[op]) - + # are any of the SNPs indels? If so, discard. for i in snp_idx: if is_indel(snp_tab[i]): overlap_indel = True return (None, None, is_split, overlap_indel) - + n_overlap_snps = len(read_offsets) if n_overlap_snps == 0: # no SNPs overlap this read @@ -294,7 +294,7 @@ def choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx): if hap_tab: # genotype info is provided by haplotype table - # pull out subset of overlapping SNPs that are heterozygous + # pull out subset of overlapping SNPs that are heterozygous # in this individual het_read_offsets = [] het_snp_idx = [] @@ -306,7 +306,7 @@ def choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx): "individuals in haplotype_tab (%d). probably " "need to specify --population or use a different " "--samples_tab" % (ind_idx, hap_tab.shape[1]/2)) - + if haps[0] != haps[1]: # this is a het het_read_offsets.append(read_offset) @@ -321,12 +321,12 @@ def choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx): if n_overlap_hets == 1: # only one overlapping SNP is a het return (het_snp_idx[0], het_read_offsets[0], is_split, overlap_indel) - + # choose ONE overlapping HETEROZYGOUS SNP randomly to add counts to # we don't want to count same read multiple times - r = np.random.randint(0, n_overlap_hets-1) + r = np.random.randint(0, n_overlap_hets) return (het_snp_idx[r], het_read_offsets[r], is_split, overlap_indel) - + else: # We don't have haplotype tab, so we don't know which SNPs are # heterozygous in this individual. 
But we can still tell @@ -335,16 +335,16 @@ def choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx): if n_overlap_snps == 1: return (snp_idx[0], read_offsets[0], is_split, overlap_indel) else: - r = np.random.randint(0, n_overlap_snps-1) + r = np.random.randint(0, n_overlap_snps) return (snp_idx[r], read_offsets[r], is_split, overlap_indel) - + def add_read_count(read, chrom, ref_array, alt_array, other_array, - read_count_array, snp_index_array, snp_tab, hap_tab, + read_count_array, snp_index_array, snp_tab, hap_tab, warned_pos, max_count, ind_idx): - + # pysam positions start at 0 start = read.pos+1 end = read.aend @@ -360,15 +360,15 @@ def add_read_count(read, chrom, ref_array, alt_array, other_array, sys.stderr.write("WARNING skipping read: handling of " "partially mapped reads not implemented\n") return - - # look for SNPs that overlap mapped read position, and if there + + # look for SNPs that overlap mapped read position, and if there # are more than one, choose one at random snp_idx, read_offset, is_split, overlap_indel = \ choose_overlap_snp(read, snp_tab, snp_index_array, hap_tab, ind_idx) if overlap_indel: return - + # store counts of reads at start position if read_count_array[start-1] < max_count: read_count_array[start-1] += 1 @@ -377,16 +377,16 @@ def add_read_count(read, chrom, ref_array, alt_array, other_array, sys.stderr.write("WARNING read count at position %d " "exceeds max %d\n" % (start, max_count)) warned_pos[start] = True - - + + if snp_idx is None: return - + snp = snp_tab[snp_idx] - + base = read.seq[read_offset] snp_pos = snp['pos'] - + if base == snp['allele1']: # matches reference allele if ref_array[snp_pos-1] < max_count: @@ -410,9 +410,9 @@ def add_read_count(read, chrom, ref_array, alt_array, other_array, elif not snp_pos in warned_pos: sys.stderr.write("WARNING other allele count at position %d " "exceeds max %d\n" % (snp_pos, max_count)) - - - + + + @@ -432,7 +432,7 @@ def parse_args(): metavar="CHROM_TXT_FILE", required=True) - + parser.add_argument("--snp_index", help="Path to HDF5 file containing SNP index. The " "SNP index is used to convert the genomic position " @@ -447,7 +447,7 @@ def parse_args(): "(rs_id), position, allele1, allele2.", metavar="SNP_TABLE_H5_FILE", required=True) - + parser.add_argument("--haplotype", help=" Path to HDF5 file to read phased haplotypes " "from. If supplied, when read overlaps multiple SNPs " @@ -476,14 +476,14 @@ def parse_args(): "with --samples argument.", metavar="INDIVIDUAL", default=None) - + parser.add_argument("--data_type", help="Data type of counts stored in HDF5 files. " "uint8 requires less disk space but has a " "maximum value of 255." - "(default=uint8)", choices=("uint8", "uint16"), + "(default=uint8)", choices=("uint8", "uint16"), default="uint8") - + parser.add_argument("--ref_as_counts", help="Path to HDF5 file to write counts of reads " "that match reference allele. Allele-specific counts " @@ -492,14 +492,14 @@ def parse_args(): metavar="REF_AS_COUNT_H5_FILE", required=True) - parser.add_argument("--alt_as_counts", + parser.add_argument("--alt_as_counts", help="Path to HDF5 file to write counts of reads " "that match alternate allele. Allele-specific counts " "are stored at the position of the SNP.", metavar="ALT_AS_COUNT_H5_FILE", required=True) - parser.add_argument("--other_as_counts", + parser.add_argument("--other_as_counts", help="Path to HDF5 file to write counts of reads " "that match neither reference nor alternate allele. 
" "Allele-specific counts are stored at the position " @@ -514,10 +514,10 @@ def parse_args(): "of the mapped read.", metavar="READ_COUNT_H5_FILE", required=True) - + parser.add_argument("bam_filenames", action="store", nargs="+", help="BAM file(s) to read mapped reads from. " - "BAMs must be sorted and indexed.") + "BAMs must be sorted and indexed.") args = parser.parse_args() @@ -525,10 +525,10 @@ def parse_args(): parser.error("--indidivual and --samples arguments " "must also be provided when --haplotype argument " "is provided") - + return args - + @@ -541,14 +541,14 @@ def lookup_individual_index(samples_file, ind_name, population=None): p = population.lower() else: p = None - + idx = 0 for line in f: if line.startswith("samples"): # header line continue - - words = line.rstrip().split() + + words = line.rstrip().split() name = words[0].replace("NA", "") if len(words) > 1: @@ -560,18 +560,18 @@ def lookup_individual_index(samples_file, ind_name, population=None): group = words[2].lower() else: group = "" - + # if specified, only consider a single population or group if p and pop != p and group != p: continue - + if name == ind_name: f.close() return idx - + idx += 1 - - + + raise ValueError("individual %s (with population=%s) " "is not in samples file %s" % (ind_name, population, samples_file)) @@ -591,7 +591,7 @@ def main(): else: hap_h5 = None ind_idx = None - + ref_count_h5 = tables.openFile(args.ref_as_counts, "w") alt_count_h5 = tables.openFile(args.alt_as_counts, "w") other_count_h5 = tables.openFile(args.other_as_counts, "w") @@ -600,10 +600,10 @@ def main(): output_h5 = [ref_count_h5, alt_count_h5, other_count_h5, read_count_h5] chrom_dict = {} - + # initialize every chromosome in output files chrom_list = chromosome.get_all_chromosomes(args.chrom) - + for chrom in chrom_list: for out_file in output_h5: create_carray(out_file, chrom, args.data_type) @@ -619,7 +619,7 @@ def main(): dtype = np.uint16 else: raise NotImplementedError("unsupported datatype %s" % args.data_type) - + for chrom in chrom_list: sys.stderr.write("%s\n" % chrom.name) @@ -638,18 +638,18 @@ def main(): hap_tab = hap_h5.getNode("/%s" % chrom.name) else: hap_tab = None - + # initialize count arrays for this chromosome to 0 ref_carray = get_carray(ref_count_h5, chrom) alt_carray = get_carray(alt_count_h5, chrom) other_carray = get_carray(other_count_h5, chrom) read_count_carray = get_carray(read_count_h5, chrom) - + ref_array = np.zeros(chrom.length, dtype) alt_array = np.zeros(chrom.length, dtype) other_array = np.zeros(chrom.length, dtype) read_count_array = np.zeros(chrom.length, dtype) - + # loop over all BAM files, pulling out reads # for this chromosome for bam_filename in args.bam_filenames: @@ -659,16 +659,16 @@ def main(): for read in get_sam_iter(samfile, chrom): count += 1 - if count == 10000: + if count == 10000: sys.stderr.write(".") count = 0 - add_read_count(read, chrom, ref_array, alt_array, - other_array, read_count_array, + add_read_count(read, chrom, ref_array, alt_array, + other_array, read_count_array, snp_index_array, snp_tab, hap_tab, warned_pos, max_count, ind_idx) - # store results for this chromosome + # store results for this chromosome ref_carray[:] = ref_array alt_carray[:] = alt_array other_carray[:] = other_array @@ -678,22 +678,22 @@ def main(): samfile.close() # set track statistics and close HDF5 files - + sys.stderr.write("setting statistics for each chromosome\n") for h5f in output_h5: chromstat.set_stats(h5f, chrom_list) h5f.close() snp_tab_h5.close() - 
snp_index_h5.close() + snp_index_h5.close() if hap_h5: hap_h5.close() sys.stderr.write("done\n") - + main() - - - + + + diff --git a/CHT/chromosome.py b/CHT/chromosome.py index d1c9d12..969cf24 100644 --- a/CHT/chromosome.py +++ b/CHT/chromosome.py @@ -187,7 +187,7 @@ def get_chromosome(filename, name): def get_all_chromosomes(filename): if filename.endswith(".gz"): - f = gzip.open(filename) + f = gzip.open(filename, "rt") else: f = open(filename) diff --git a/CHT/combined_test.py b/CHT/combined_test.py index 3ec7839..4c9314b 100644 --- a/CHT/combined_test.py +++ b/CHT/combined_test.py @@ -20,22 +20,20 @@ import argparse from scipy.optimize import * -from scipy import cast from scipy.special import gammaln from scipy.special import betaln import scipy.stats import numpy as np -from random import shuffle -from random import randint +import random # OPTIMIZER="BFGS" OPTIMIZER="Nelder-Mead" class TestSNP: - def __init__(self, name, geno_hap1, geno_hap2, AS_target_ref, AS_target_alt, + def __init__(self, name, geno_hap1, geno_hap2, AS_target_ref, AS_target_alt, hetps, totals, counts): self.name = name self.geno_hap1 = geno_hap1 @@ -46,9 +44,9 @@ def __init__(self, name, geno_hap1, geno_hap2, AS_target_ref, AS_target_alt, self.totals = totals self.counts = counts - + def is_het(self): - """returns True if the test SNP is heterozygous""" + """returns True if the test SNP is heterozygous""" return self.geno_hap1 != self.geno_hap2 def is_homo_ref(self): @@ -66,7 +64,7 @@ def open_input_files(in_filename): sys.stderr.write("input file %s does not exist or is not a regular file\n" % in_filename) exit(2) - + # read file that contains list of input files in_file = open(in_filename) @@ -75,37 +73,37 @@ def open_input_files(in_filename): # open each input file and read first line filename = line.rstrip() if not filename or not os.path.exists(filename) or not os.path.isfile(filename): - sys.stderr.write("input file '%s' does not exist or is not a regular file\n" + sys.stderr.write("input file '%s' does not exist or is not a regular file\n" % line) exit(2) if filename.endswith(".gz"): - f = gzip.open(filename) + f = gzip.open(filename, "rt") else: - f = open(filename) - + f = open(filename, "r") + # skip header f.readline() infiles.append(f) in_file.close() - + if len(infiles) == 0: sys.stderr.write("no input files specified in file '%s'\n" % options.infile_list) exit(2) return infiles - + def write_header(outfile): outfile.write("\t".join(["TEST.SNP.CHROM", "TEST.SNP.POS", - "LOGLIKE.NULL", "LOGLIKE.ALT", - "CHISQ", "P.VALUE", "ALPHA", "BETA", - "PHI", "TOTAL.AS.READ.COUNT", + "LOGLIKE.NULL", "LOGLIKE.ALT", + "CHISQ", "P.VALUE", "ALPHA", "BETA", + "PHI", "TOTAL.AS.READ.COUNT", "TOTAL.READ.COUNT"]) + "\n") -def read_bnb_sigmas(options, infiles): +def read_bnb_sigmas(options, infiles): """Read overdispersion parameters for beta-negative binomial. 
Expect one for each individual.""" if (options.bnb_disp): @@ -125,7 +123,7 @@ def read_bnb_sigmas(options, infiles): bnb_sigmas = [0.001]*len(infiles) return bnb_sigmas - + def read_as_sigmas(options, infiles): @@ -140,18 +138,18 @@ def read_as_sigmas(options, infiles): val = np.float64(line.strip()) if val < 0.0 or val > 1.0: raise ValueError("expected as_sigma values to be " - " in range 0.0-1.0, but got %g" % + " in range 0.0-1.0, but got %g" % val) as_sigmas.append(np.float64(line.strip())) line = disp_file.readline() - + disp_file.close() if len(as_sigmas) != len(infiles): raise ValueError("expected %d values in as_disp file " "(one for each input file) but got " "%d" % (len(infiles), len(as_sigmas))) - + else: as_sigmas = [0.001] * len(infiles) @@ -159,40 +157,40 @@ def read_as_sigmas(options, infiles): -def write_results(outfile, snpinfo, loglike1par, loglike2par, +def write_results(outfile, snpinfo, loglike1par, loglike2par, best2par, totcounts, all_counts): - """Write result to output file. Tab-delimited columns are: - 1. chromosome, - 2. SNP position, + """Write result to output file. Tab-delimited columns are: + 1. chromosome, + 2. SNP position, 3. Log likelihood 1 parameter model (Null) 4. Log likelihood 2 parameter model (Alternative) - 3. Chi-squared statistic, + 3. Chi-squared statistic, 4. P-value - 5. alpha parameter estimate (expression level + 5. alpha parameter estimate (expression level of reference allele) - 6. beta parameter estimate (expression level of + 6. beta parameter estimate (expression level of alternative allele) - 7. phi parameter estimate (beta-negative-binomial - overdispersion + 7. phi parameter estimate (beta-negative-binomial + overdispersion parameter for this region) 8. total number of allele-specific read counts for this region summed across individuals - 9. total number of mapped reads for this region, + 9. total number of mapped reads for this region, summed across individuals""" # compute likelihood ratio test statistic: chisq = 2 * (loglike1par - loglike2par) - pval = (1-scipy.stats.chi2.cdf(chisq,1)), - + pval = (1-scipy.stats.chi2.cdf(chisq,1)), + outfile.write("\t".join([snpinfo[0][0], snpinfo[0][1], "%.2f" % -loglike1par, "%.2f" % -loglike2par, "%.3f" % chisq, "%g" % pval, - "%.3f" % best2par[0], - "%.3f" % best2par[1], - "%g" % best2par[2], - "%d" % totcounts, + "%.3f" % best2par[0], + "%.3f" % best2par[1], + "%g" % best2par[2], + "%d" % totcounts, "%d" % all_counts]) + '\n') outfile.flush() @@ -201,12 +199,12 @@ def write_empty_result(outfile, snpinfo): """Write all zeros in the even that the test failed""" outfile.write("\t".join([snpinfo[0][0], snpinfo[0][1], "0", "0", "0", "NA", "0", "0", "0", "0"]) + '\n') - + def main(): options = parse_options() - + if options.pc_file: pc_matrix = load_covariates(options.pc_file) num_pcs = options.num_pcs @@ -227,9 +225,6 @@ def main(): bench_file.write("TEST.TYPE TIME\n") - - - write_header(outfile) # read list of input files (one for each individual) @@ -239,7 +234,7 @@ def main(): bnb_sigmas = read_bnb_sigmas(options, infiles) as_sigmas = read_as_sigmas(options, infiles) - + # add first row of each input file to snpinfo list snpinfo = [] for f in infiles: @@ -247,8 +242,10 @@ def main(): row_count = 0 finished=False - - while not finished: + + options.dup_snp_warn = True + + while not finished: try: test_snps = [] # parse test SNP and associated info from input file row @@ -257,16 +254,16 @@ def main(): # how many allele-specific reads are there across all linked SNPs and # and individuals? 
- totcounts = sum([np.sum(x.AS_target_ref) + np.sum(x.AS_target_alt) + totcounts = sum([np.sum(x.AS_target_ref) + np.sum(x.AS_target_alt) for x in test_snps]) - + all_counts = sum([test_snps[i].counts for i in range(len(test_snps))]) if totcounts < options.min_as_counts: if options.verbose: sys.stderr.write("-----\nskipping SNP %s because " - "total AS counts %d <= %d\n" % + "total AS counts %d <= %d\n" % (test_snps[0].name, totcounts, options.min_as_counts)) # skip, not enough allele-specific counts @@ -281,15 +278,15 @@ def main(): if options.verbose: sys.stderr.write("-----\ntesting SNP %s\n" % test_snps[0].name) - + row_count+=1 - old_genos = [test_snps[y].geno_hap1 + test_snps[y].geno_hap2 + old_genos = [test_snps[y].geno_hap1 + test_snps[y].geno_hap2 for y in range(len(test_snps))] - + if options.shuffle: # permute genotypes perm = range(len(test_snps)) - shuffle(perm) + random.shuffle(perm) geno1temp = [test_snps[y].geno_hap1 for y in perm] geno2temp = [test_snps[y].geno_hap2 for y in perm] for i in range(len(test_snps)): @@ -299,17 +296,17 @@ def main(): if options.benchmark: # start timing test for NULL model - bench_file.write("null %s %s %d %d " % (snpinfo[0][0], snpinfo[0][1], + bench_file.write("null %s %s %d %d " % (snpinfo[0][0], snpinfo[0][1], totcounts, all_counts)) bench_file.flush() t1 = time.time() - + starting_gene = [np.float64(x) for x in [0.1, 0.001]] maxlike = 10000000000 - + for start in starting_gene: starts = [np.float64(0.5), np.float64(start)] - + # regress against the covariates and get residuals #fit_cov(test_snps,cov_table) @@ -317,7 +314,7 @@ def main(): res = minimize(ll_one, starts, args=(test_snps, True, #options.is_bnb_only, options.is_as_only, bnb_sigmas, - as_sigmas, + as_sigmas, options.read_error_rate, [], pc_matrix), @@ -325,41 +322,41 @@ def main(): method=OPTIMIZER) new_par = res.x - + new_loglike = ll_one(new_par, test_snps, options.is_bnb_only, options.is_as_only, bnb_sigmas, as_sigmas, options.read_error_rate, [], pc_matrix) if new_loglike < maxlike: starting_par = new_par - + pc_coefs=[] for pc in range(num_pcs): - res = minimize(ll_pc, [np.float64(0)], + res = minimize(ll_pc, [np.float64(0)], args=(starting_par, test_snps, True, #options.is_bnb_only, - options.is_as_only, bnb_sigmas, as_sigmas, + options.is_as_only, bnb_sigmas, as_sigmas, options.read_error_rate, pc_coefs, pc_matrix), options={"maxiter" : 50000, "disp" : options.verbose}, method=OPTIMIZER) new_coef = res.x pc_coefs = np.concatenate([pc_coefs, new_coef]) - - res = minimize(ll_one, starting_par, + + res = minimize(ll_one, starting_par, args=(test_snps, options.is_bnb_only, options.is_as_only, bnb_sigmas, as_sigmas, options.read_error_rate, pc_coefs, pc_matrix), options={"maxiter" : 50000, "disp" : options.verbose}, method=OPTIMIZER) best1par = res.x - + time_taken = time.time() - t1 if options.verbose: sys.stderr.write("null model optimization took %.3fs\n" % time_taken) if options.benchmark: bench_file.write("%.3f\n" % time_taken) bench_file.flush() - - + + loglike1par = ll_one(best1par, test_snps, options.is_bnb_only, options.is_as_only, bnb_sigmas, as_sigmas, options.read_error_rate, @@ -370,7 +367,7 @@ def main(): if options.benchmark: # start timing test for ALT model - bench_file.write("alt %s %s %d %d " % (snpinfo[0][0], snpinfo[0][1], + bench_file.write("alt %s %s %d %d " % (snpinfo[0][0], snpinfo[0][1], totcounts, all_counts)) bench_file.flush() @@ -391,15 +388,15 @@ def main(): if options.benchmark: bench_file.write("%.3f\n" % time_taken) bench_file.flush() - + 
loglike2par = ll_two(best2par, test_snps, options.is_bnb_only, options.is_as_only, bnb_sigmas, as_sigmas, options.read_error_rate, pc_coefs, pc_matrix) - - write_results(outfile, snpinfo, loglike1par, loglike2par, best2par, + + write_results(outfile, snpinfo, loglike1par, loglike2par, best2par, totcounts, all_counts) - + except Exception as e: write_empty_result(outfile, snpinfo) # an error occured, write to output file, but put 0s for all params and @@ -420,57 +417,57 @@ def main(): def parse_options(): parser=argparse.ArgumentParser() - parser.add_argument("-a", "--as_only", - action='store_true', + parser.add_argument("-a", "--as_only", + action='store_true', dest='is_as_only', default=False, help="only perform the allele-specific part (Beta Binomial) " "part of the test") - - parser.add_argument("-d", "--bnb_only", action='store_true', + + parser.add_argument("-d", "--bnb_only", action='store_true', dest='is_bnb_only', default=False, help="only perform the association (Beta Negative Binomial) part " "of the test") - - parser.add_argument("--pc_file", action='store', - dest='pc_file', + + parser.add_argument("--pc_file", action='store', + dest='pc_file', help="file containing PC covariates to include in the model" ,default=None) - parser.add_argument("-b", "--bnb_disp", action='store', dest='bnb_disp', + parser.add_argument("-b", "--bnb_disp", action='store', dest='bnb_disp', help="file containing depth (Beta Negative Binomial)" "dispersion parameters", default=None) - parser.add_argument("-o", "--as_disp", action='store', - dest='as_disp', + parser.add_argument("-o", "--as_disp", action='store', + dest='as_disp', help="file containing allele-specific (Beta Binomial) dispersion " "parameters", default=None) - parser.add_argument("-s", "--shuffle", action='store_true', + parser.add_argument("-s", "--shuffle", action='store_true', dest='shuffle', default=False, help="permute genotypes") - + parser.add_argument("-e", "--read_error_rate", action='store', dest='read_error_rate', help="estimate of error rate, used to update " "heterozygous genotype probabilities " "(currently this option disabled / not used)", type=float, default=0.005) - - parser.add_argument("-m", "--min_as_counts", action='store', dest='min_as_counts', + + parser.add_argument("-m", "--min_as_counts", action='store', dest='min_as_counts', type=int, default=0, help="only perform test when total number of allele-specific " "read counts across individuals > MIN_COUNTS") - parser.add_argument("--num_pcs", action='store', dest='num_pcs', + parser.add_argument("--num_pcs", action='store', dest='num_pcs', type=int, default=0, help="designates the number of PCs to use as covariates") - - parser.add_argument("-v", "--verbose", action='store_true', dest='verbose', + + parser.add_argument("-v", "--verbose", action='store_true', dest='verbose', default=False, help="print extra information") parser.add_argument("--benchmark", dest="benchmark", help="write information about time test is takes, number of optimization " "functions, etc. 
to specified filename, or to stderr if '-' is specified") - + parser.add_argument("infile_list", action='store', default=None) parser.add_argument("out_file", action='store', default=None) @@ -478,7 +475,7 @@ def parse_options(): - + def addlogs(loga, logb): """Helper function: perform numerically-stable addition in log space""" return max(loga, logb) + math.log(1 + math.exp(-abs(loga - logb))) @@ -487,23 +484,23 @@ def addlogs(loga, logb): def AS_betabinom_loglike(logps, sigma, AS1, AS2, hetp, error): """Given parameter, returns log likelihood of allele-specific - part of test. Note that some parts of equation have been + part of test. Note that some parts of equation have been canceled out""" a = math.exp(logps[0] + math.log(1/sigma**2 - 1)) b = math.exp(logps[1] + math.log(1/sigma**2 - 1)) - + part1 = 0 part1 += betaln(AS1 + a, AS2 + b) part1 -= betaln(a, b) - + if hetp==1: - return part1 + return part1 e1 = math.log(error) * AS1 + math.log(1 - error) * AS2 e2 = math.log(error) * AS2 + math.log(1 - error) * AS1 if hetp == 0: return addlogs(e1, e2) - + return addlogs(math.log(hetp)+part1, math.log(1-hetp) + addlogs(e1,e2)) def betaln_asym(a,b): @@ -554,7 +551,7 @@ def BNB_loglike(k,mean,sigma,n): #Add log(beta(a+n,b+k)) loglike += betaln_asym(a+n,b+k) - + #Subtract log(beta(a,b)) loglike -= betaln_asym(a,b) @@ -562,26 +559,26 @@ def BNB_loglike(k,mean,sigma,n): -def ll_one(x, test_snps, is_bnb_only, is_as_only, bnb_sigmas, +def ll_one(x, test_snps, is_bnb_only, is_as_only, bnb_sigmas, as_sigmas, error, pc_coefs, pc_matrix): alpha = x[0] beta = x[0] r = x[1] - return loglikelihood(alpha, beta, r, test_snps, is_bnb_only, + return loglikelihood(alpha, beta, r, test_snps, is_bnb_only, is_as_only, bnb_sigmas, as_sigmas, error, pc_coefs, pc_matrix) -def ll_pc(x, params, test_snps, is_bnb_only, is_as_only, bnb_sigmas, +def ll_pc(x, params, test_snps, is_bnb_only, is_as_only, bnb_sigmas, as_sigmas, error, other_pc_coefs, pc_matrix): alpha = params[0] beta = params[0] r = params[1] pc_coefs=np.concatenate([other_pc_coefs,x]) - return loglikelihood(alpha, beta, r, test_snps, is_bnb_only, + return loglikelihood(alpha, beta, r, test_snps, is_bnb_only, is_as_only, bnb_sigmas, as_sigmas, error, pc_coefs,pc_matrix) -def ll_two(x, test_snps, is_bnb_only, is_as_only, bnb_sigmas, +def ll_two(x, test_snps, is_bnb_only, is_as_only, bnb_sigmas, as_sigmas, error, pc_coefs, pc_matrix): alpha = x[0] beta = x[1] @@ -590,7 +587,7 @@ def ll_two(x, test_snps, is_bnb_only, is_as_only, bnb_sigmas, # pc_fits=x[3:] #else: # pc_fits=[] - return loglikelihood(alpha, beta, r, test_snps, is_bnb_only, + return loglikelihood(alpha, beta, r, test_snps, is_bnb_only, is_as_only, bnb_sigmas, as_sigmas, error, pc_coefs, pc_matrix) @@ -600,13 +597,13 @@ def calc_pc_factor(pc_fits, pcs, i): else: return 1 - -def loglikelihood(alpha, beta, r, test_snps, is_bnb_only, - is_as_only, bnb_sigmas, as_sigmas, error, - pc_coefs, pc_matrix): + +def loglikelihood(alpha, beta, r, test_snps, is_bnb_only, + is_as_only, bnb_sigmas, as_sigmas, error, + pc_coefs, pc_matrix): loglike = 0 - # if input values are outside of reasonable range return a + # if input values are outside of reasonable range return a # very high -loglike if alpha <= 0 or beta <= 0 or r <= 0 or r > 1: return 10000000 @@ -623,12 +620,12 @@ def loglikelihood(alpha, beta, r, test_snps, is_bnb_only, if m<0: m = 0.000001 if not is_bnb_only: - for j in range(len(test_snps[i].AS_target_ref)): + for j in range(len(test_snps[i].AS_target_ref)): if test_snps[i].hetps[j]>.9: hetp = 
min(0.99, test_snps[i].hetps[j]) logps = [math.log(alpha) - math.log(alpha+beta), math.log(beta) - math.log(alpha+beta)] - loglike += AS_betabinom_loglike(logps, as_sigmas[i], + loglike += AS_betabinom_loglike(logps, as_sigmas[i], test_snps[i].AS_target_ref[j], test_snps[i].AS_target_alt[j], hetp, error) @@ -638,7 +635,7 @@ def loglikelihood(alpha, beta, r, test_snps, is_bnb_only, return -loglike -def parse_test_snp(snpinfo, options): +def parse_test_snp(snpinfo, options): snp_id = snpinfo[2] if snpinfo[16] == "NA": # SNP is missing data @@ -656,45 +653,60 @@ def parse_test_snp(snpinfo, options): else: geno_hap1 = int(snpinfo[6].strip().split("|")[0]) geno_hap2 = int(snpinfo[6].strip().split("|")[1]) - + if snpinfo[15] == "NA": count = 0 else: count = int(snpinfo[15]) - #if snpinfo[9].strip() == "NA": - # SNP is homozygous, so there is no AS info - # return TestSNP(snp_id, geno_hap1, geno_hap2, [], [], [], tot, count) if snpinfo[9].strip() == "NA" or geno_hap1 == geno_hap2: # SNP is homozygous, so there is no AS info - return TestSNP(snp_id, geno_hap1, geno_hap2, [], [], [], tot, count) + return TestSNP(snp_id, geno_hap1, geno_hap2, [], [], [], tot, count) else: - # positions of target SNPs (not currently used) - snplocs=[int(y.strip()) for y in snpinfo[9].split(';')] + # positions of target SNPs + snp_locs = np.array([int(y.strip()) for y in snpinfo[9].split(';')]) # counts of reads that match reference overlapping linked 'target' SNPs - AS_target_ref = [int(y) for y in snpinfo[12].split(';')] + snp_as_ref = np.array([int(y) for y in snpinfo[12].split(';')]) # counts of reads that match alternate allele - AS_target_alt = [int(y) for y in snpinfo[13].split(';')] + snp_as_alt = np.array([int(y) for y in snpinfo[13].split(';')]) # heterozygote probabilities - hetps = [np.float64(y.strip()) for y in snpinfo[10].split(';')] + snp_hetps = np.array([np.float64(y.strip()) + for y in snpinfo[10].split(';')]) # linkage probabilities, not currently used - linkageps = [np.float64(y.strip()) for y in snpinfo[11].split(';')] + snp_linkageps = np.array([np.float64(y.strip()) + for y in snpinfo[11].split(';')]) + + + # same SNP should not be provided multiple times, this + # can create problems with combined test. 
Warn and filter + # duplicate SNPs + uniq_loc, uniq_idx = np.unique(snp_locs, return_index=True) + if options.dup_snp_warn and uniq_loc.shape[0] != snp_locs.shape[0]: + sys.stderr.write("WARNING: discarding SNPs that are repeated " + "multiple times in same line\n") + options.dup_snp_warn = False + + snp_as_ref = snp_as_ref[uniq_idx] + snp_as_alt = snp_as_alt[uniq_idx] + snp_hetps = snp_hetps[uniq_idx] + snp_linkageps = snp_linkageps[uniq_idx] + if options.shuffle: # permute allele-specific read counts by flipping them randomly at # each SNP - for y in range(len(AS_target_ref)): - if randint(0,1) == 1: - temp=AS_target_ref[y] - AS_target_ref[y] = AS_target_alt[y] - AS_target_alt[y] = temp + for y in range(len(snp_as_ref)): + if random.randint(0, 1) == 1: + temp = snp_as_ref[y] + snp_as_ref[y] = snp_as_alt[y] + snp_as_alt[y] = temp - return TestSNP(snp_id, geno_hap1, geno_hap2, AS_target_ref, - AS_target_alt, hetps, tot, count) + return TestSNP(snp_id, geno_hap1, geno_hap2, snp_as_ref, + snp_as_alt, snp_hetps, tot, count) def load_covariates(cov_file): infile=open(cov_file) @@ -705,7 +717,7 @@ def load_covariates(cov_file): cov_table.append([np.float64(x) for x in line.strip().split()]) else: break - + return np.array(cov_table, dtype=np.float64) - + main() diff --git a/CHT/extract_haplotype_read_counts.py b/CHT/extract_haplotype_read_counts.py index 1636a69..fabf90e 100644 --- a/CHT/extract_haplotype_read_counts.py +++ b/CHT/extract_haplotype_read_counts.py @@ -277,7 +277,7 @@ def lookup_individual_index(options, ind_name): to lookup information in the genotype and haplotype tables""" sys.stderr.write("reading list of individuals from %s\n" % options.samples) - f = open(options.samples) + f = open(options.samples, "r") idx = 0 for line in f: @@ -530,10 +530,13 @@ def main(): genomewide_read_counts = get_genomewide_count(data_files.read_count_h5, chrom_list) + + unknown_chrom = set([]) + if args.input_file.endswith(".gz"): - f = gzip.open(args.input_file) + f = gzip.open(args.input_file, "rt") else: - f = open(args.input_file) + f = open(args.input_file, "r") line_count = 0 @@ -557,7 +560,22 @@ def main(): continue chrom_name = words[0] - chrom = chrom_dict[chrom_name] + if chrom_name in chrom_dict: + chrom = chrom_dict[chrom_name] + else: + if not chrom_name.startswith("chr"): + # try adding 'chr' to front of name + new_chrom_name = "chr" + chrom_name + if new_chrom_name in chrom_dict: + chrom_name = new_chrom_name + chrom = chrom_dict[chrom_name] + else: + # can't figure out this chromosome name + if not chrom_name in unknown_chrom: + unknown_chrom.add(chrom_name) + sys.stderr.write("WARNING: unknown chromosome '%s'") + continue + region_list = get_target_regions(args, chrom, words) diff --git a/CHT/fit_as_coefficients.py b/CHT/fit_as_coefficients.py index c0a3350..efa2864 100644 --- a/CHT/fit_as_coefficients.py +++ b/CHT/fit_as_coefficients.py @@ -33,21 +33,21 @@ def parse_options(): "each individual (across sites), under the assumption " "that all sites come from the null hypothesis (no " "genetic association)"); - - - parser.add_argument("-e", action='store', dest='read_error_rate', - help="sequence read error rate", + + + parser.add_argument("--read_error_rate", "-e", action='store', dest='read_error_rate', + help="sequence read error rate (default=0.005)", type=float, default=0.005) - - parser.add_argument("infile_list", + + parser.add_argument("infile_list", help="Path to file containing list of CHT input " "files (one for each individual)", action='store', default=None) - - 
parser.add_argument("out_file", + + parser.add_argument("out_file", help="File to write overdispersion parameter estimates to", action='store', default=None) - + return parser.parse_args() @@ -58,7 +58,7 @@ def open_input_files(in_filename): sys.stderr.write("input file %s does not exist or is not a regular file\n" % in_filename) exit(2) - + # read file that contains list of input files in_file = open(in_filename) @@ -67,20 +67,20 @@ def open_input_files(in_filename): # open each input file and read first line filename = line.rstrip() if not filename or not os.path.exists(filename) or not os.path.isfile(filename): - sys.stderr.write("input file '%s' does not exist or is not a regular file\n" + sys.stderr.write("input file '%s' does not exist or is not a regular file\n" % line) exit(2) if filename.endswith(".gz"): - f = gzip.open(filename) + f = gzip.open(filename, "rt") else: f = open(filename) - + # skip header f.readline() infiles.append(f) in_file.close() - + if len(infiles) == 0: sys.stderr.write("no input files specified in file '%s'\n" % options.infile_list) exit(2) @@ -93,37 +93,76 @@ def main(): options = parse_options() infiles = open_input_files(options.infile_list) outfile = open(options.out_file,"w") + dup_snp_warn = True + + # read input data and estimate dispersion coefficient + # for one individual at a time + for i in range(len(infiles)): + cur_file = infiles[i] + + AS_ref = [] + AS_alt = [] + hetps =[] - for cur_file in infiles: - AS_ref=[] - AS_alt=[] - hetps=[] - cur_line=cur_file.readline() - while True: - try: - cur_line=cur_file.readline() - except: - break - if not cur_line: - break - snpinfo=cur_line.strip().split() + header = cur_file.readline() + # combine allele-specific read counts into one large + # array for this individual + for line in cur_file: + snpinfo = line.strip().split() + if snpinfo[12] != "NA": - AS_ref = AS_ref + [int(y) for y in snpinfo[12].split(';')] - AS_alt = AS_alt + [int(y) for y in snpinfo[13].split(';')] - hetps = hetps + [float(y.strip()) for y in snpinfo[10].split(';')] - dispersion = fmin(likelihood, 0.001, - args=(AS_ref, AS_alt, hetps, options.read_error_rate)) - outfile.write(str(dispersion[0])+"\n") + # read information aout target SNPs + snp_locs = np.array([int(y.strip()) for y in snpinfo[9].split(';')], + dtype=np.int32) + snp_as_ref = np.array([int(y) for y in snpinfo[12].split(';')], + dtype=np.int32) + snp_as_alt = np.array([int(y) for y in snpinfo[13].split(';')], + dtype=np.int32) + snp_hetps = np.array([float(y.strip()) for y in snpinfo[10].split(';')], + dtype=np.float64) + + # same SNP should not be provided multiple times, this + # can create problems with combined test. 
Warn and filter + # duplicate SNPs + uniq_loc, uniq_idx = np.unique(snp_locs, return_index=True) + + if dup_snp_warn and uniq_loc.shape[0] != snp_locs.shape[0]: + sys.stderr.write("WARNING: discarding SNPs that are repeated " + "multiple times in same line\n") + dup_snp_warn = False + + AS_ref.extend(snp_as_ref[uniq_idx]) + AS_alt.extend(snp_as_alt[uniq_idx]) + hetps.extend(snp_hetps[uniq_idx]) + + AS_ref = np.array(AS_ref) + AS_alt = np.array(AS_alt) + hetps = np.array(hetps) + + # find maximu likelihood estimate for overdispersion parameter + res = minimize_scalar(likelihood, bounds=(0.01, 0.99), + args=(AS_ref, AS_alt, hetps, options.read_error_rate), + options = {'xatol' : 0.001}, + method="Bounded") + LL = res.fun + dispersion = res.x + + sys.stderr.write("AS dispersion[%d]: %g\n" % (i, dispersion)) + sys.stderr.write("LL[%d]: -%g\n" % (i, LL)) + + outfile.write(str(dispersion)+"\n") outfile.flush() - -def likelihood(dispersion,AS_ref,AS_alt,hetps,error): - cur_like=0 + +def likelihood(dispersion, AS_ref, AS_alt, hetps, error): + cur_like = 0 for i in range(len(AS_ref)): - cur_like = cur_like + AS_betabinom_loglike([math.log(0.5), math.log(0.5)], - dispersion, AS_ref[i], - AS_alt[i], hetps[i], error) + # calculate likelihood for each heterozygous site, under + # assumption that true reference proportion is 50% + cur_like += AS_betabinom_loglike([math.log(0.5), math.log(0.5)], + dispersion, AS_ref[i], + AS_alt[i], hetps[i], error) return -cur_like @@ -139,22 +178,22 @@ def addlogs(loga, logb): def AS_betabinom_loglike(logps, sigma, AS1, AS2, hetp, error): if sigma >= 1.0 or sigma <= 0.0: return -99999999999.0 - else: - a = math.exp(logps[0] + math.log(1/sigma**2 - 1)) - b = math.exp(logps[1] + math.log(1/sigma**2 - 1)) - + + a = math.exp(logps[0] + math.log(1/sigma**2 - 1)) + b = math.exp(logps[1] + math.log(1/sigma**2 - 1)) + part1 = 0 part1 += betaln(AS1 + a, AS2 + b) part1 -= betaln(a, b) - + if hetp == 1: - return part1 + return part1 e1 = math.log(error) * AS1 + math.log(1 - error) * AS2 e2 = math.log(error) * AS2 + math.log(1 - error) * AS1 if hetp == 0: return addlogs(e1, e2) - + return addlogs(math.log(hetp)+part1, math.log(1-hetp) + addlogs(e1,e2)) main() diff --git a/CHT/fit_bnb_coefficients.py b/CHT/fit_bnb_coefficients.py index 9b6abbb..54569a5 100644 --- a/CHT/fit_bnb_coefficients.py +++ b/CHT/fit_bnb_coefficients.py @@ -22,80 +22,69 @@ from scipy.special import gammaln from scipy.special import betaln import scipy.stats - +import time import numpy as np +import cht_data -class TestSNP: - def __init__(self, name, geno_hap1, geno_hap2, AS_target_ref, AS_target_alt, - hetps, totals, counts): - self.name = name - self.geno_hap1 = geno_hap1 - self.geno_hap2 = geno_hap2 - self.AS_target_ref = AS_target_ref - self.AS_target_alt = AS_target_alt - self.hetps = hetps - self.totals = totals - self.counts = counts - - def is_het(self): - """returns True if the test SNP is heterozygous""" - return self.geno_hap1 != self.geno_hap2 +MIN_GENE_FIT = 0.0 +MAX_GENE_FIT = 1e8 - def is_homo_ref(self): - """Returns True if test SNP is homozygous for reference allele""" - return self.geno_hap1 == 0 and self.geno_hap2 == 0 +MIN_GW_FIT = 1.0 +MAX_GW_FIT = 10e3 - def is_homo_alt(self): - """Returns True if test SNP is homozygous for non-reference allele""" - return self.geno_hap1 == 1 and self.geno_hap2 == 1 +# MIN_GENE_FIT = 0.0 +# MAX_GENE_FIT = 1.0 +# MIN_GW_FIT = 1.0 +# MAX_GW_FIT = 1e3 -def open_input_files(in_filename): - if not os.path.exists(in_filename) or not os.path.isfile(in_filename): 
- sys.stderr.write("input file %s does not exist or is not a regular file\n" % - in_filename) - exit(2) - - # read file that contains list of input files - in_file = open(in_filename) - - infiles = [] - for line in in_file: - # open each input file and read first line - filename = line.rstrip() - sys.stderr.write(filename+"\n") - if not filename or not os.path.exists(filename) or not os.path.isfile(filename): - sys.stderr.write("input file '%s' does not exist or is not a regular file\n" - % in_file) - exit(2) - if filename.endswith(".gz"): - f = gzip.open(filename) - else: - f = open(filename) - - # skip header - f.readline() - infiles.append(f) - in_file.close() - - if len(infiles) == 0: - sys.stderr.write("no input files specified in file '%s'\n" % options.infile_list) - exit(2) +# bounds for parameters +MIN_MEAN_FIT = MIN_GENE_FIT +MAX_MEAN_FIT = MAX_GENE_FIT + +GW_XTOL = 1e-2 +GENE_XTOL = 1e-7 +MEAN_XTOL = 1e-7 + - return infiles def parse_options(): - parser=argparse.ArgumentParser() - + + default_sample = 2000 + + parser=argparse.ArgumentParser(description="This program estimates the " + "genome-wide dispersion parameters for " + "the read depth (beta-negative binomial, BNB) part " + "of the combined haplotype test. These " + "parameter estimates are not required " + "for the allele-specific (beta-binomial) part " + "of the test." + "This program uses an iterative " + "maximum likelihood approach to estimate " + "parameters. To speed up parameter estimation " + "a subset of test regions are randomly " + "selected from the input file. By default %d" + " regions are selected. At the end of each " + "iteration, the current parameter estimates " + "are written to stdout and to the " + "specified output file. " + "If you are tired of waiting for the " + "program to converge, these intermediate " + "estimates can be used before the program " + "has finished running." % default_sample) + + + + parser.add_argument("infile_list", action='store', default=None) - + parser.add_argument("outfile", action='store', default=None) - + parser.add_argument("--min_as_counts", action='store', dest='min_as_counts', type=int, default=0, metavar="MIN_COUNTS", help="only use regions where total allele-specific " @@ -105,132 +94,132 @@ def parse_options(): type=int, default=0, metavar="MIN_COUNTS", help="only use regions where total number of " "read counts across individuals > MIN_COUNTS") - + parser.add_argument("--skip", action='store', dest='skip', type=int, default=0, help="skip n test region between each one used for fitting") - return parser.parse_args() + parser.add_argument("--sample", type=int, default=default_sample, + help="Randomly sample a specified number of test " + "regions from the total set to speed up the " + "estimation procedure. This is set to %d " + "by default. To " + "disable sampling, set this to 0 (--sample 0), or use " + "the --no_sample option" % default_sample) + parser.add_argument("--no_sample", action="store_true", dest="no_sample", + help="Disable random sampling of test regions", + default=False) + parser.add_argument("--seed", help="Random seed which affects random " + "sampling of test regions. 
Useful for testing.", + type=int, + default=-1) + parser.add_argument("--fix_gene", type=float, default=-1.0, + help="fix the per-gene dispersion parameter estimates to " + "specified value, and only fit the genome-wide dispersion " + "parameter estimates") + + parser.add_argument("--fix_mean", type=float, default=-1.0, + help="fix the per-gene mean dispersion parameter estimates to " + "specified value, and only fit the genome-wide dispersion " + "parameter estimates") + + options = parser.parse_args() + + if options.no_sample or options.sample < 0: + # do not perform sampling + options.sample = 0 + + return options -def read_counts(options): - infiles = open_input_files(options.infile_list) - - # add first row of each input file to snpinfo list - snpinfo = [] - for f in infiles: - f.readline() - snpinfo.append(f.readline().strip().split()) - - finished = False - count_matrix = [] - expected_matrix = [] - skip_num = 0 - while not finished: - if skip_num < options.skip: - skip_num += 1 - for i in range(len(infiles)): - line = infiles[i].readline().strip() - if line: - snpinfo[i] = line.split() - else: - # out of lines from at least one file, assume we are finished - finished = True - continue - skip_num = 0 - count_line = [] - expected_line = [] - # parse test SNP and associated info from input file row - num_as = 0 - for i in range(len(infiles)): - new_snp = parse_test_snp(snpinfo[i], options) - if new_snp.is_het(): - num_as += np.sum(new_snp.AS_target_ref) + \ - np.sum(new_snp.AS_target_alt) - - count_line.append(new_snp.counts) - expected_line.append(new_snp.totals) - - line =infiles[i].readline().strip() - if line: - snpinfo[i] = line.split() - else: - # out of lines from at least one file, assume we are finished - finished = True - - if(sum(count_line) >= options.min_counts and num_as >= options.min_as_counts): - count_matrix.append(count_line) - expected_matrix.append(expected_line) - - count_matrix = np.array(count_matrix, dtype=int) - expected_matrix = np.array(expected_matrix, dtype=np.float64) - # sys.stderr.write(str(count_matrix[:10,])+"\n") - # sys.stderr.write(str(expected_matrix[:10,])+"\n") - # sys.stderr.write(str(expected_matrix.shape)) - # sys.stderr.write(str(count_matrix.shape)) - return count_matrix, expected_matrix def main(): options = parse_options() + # set random seed: + if options.seed >= 0: + np.random.seed(seed=options.seed) + # read input data - count_matrix, expected_matrix = read_counts(options) + sys.stderr.write("reading input data\n") + count_matrix, expected_matrix = \ + cht_data.read_count_matrices(options.infile_list, + shuffle=False, + skip=options.skip, + min_counts=options.min_counts, + min_as_counts=options.min_as_counts) - old_ll = np.float64(1000000000000000000000000) - best_start = -1 - gene_fit_starts = (0.01, 0.005) + gene_fits = [np.float64(0.005)] * count_matrix.shape[0] + mean_fits = [np.float64(1)] * count_matrix.shape[0] + gw_fits = [np.float64(100)] * count_matrix.shape[1] - # first interaction to get initial parameter estimates - for i in range(len(gene_fit_starts)): - sys.stderr.write("iteration 0\n") - gene_fits = [np.float64(gene_fit_starts[i])] * count_matrix.shape[0] - mean_fits = [np.float64(1)] * count_matrix.shape[0] - gw_fits = [np.float64(100)] * count_matrix.shape[1] - - sys.stderr.write("fitting genome-wide overdispersion params\n") - gw_fits, fit_ll = get_gw_overdisp(count_matrix, expected_matrix, - gw_fits, gene_fits, mean_fits) + old_ll = np.float64(1000000000000000000000000) - sys.stderr.write("fitting per-region 
overdiserpersion params\n") - gene_fits, mean_fits, fit_ll = get_gene_overdisp(count_matrix, expected_matrix, - gw_fits, gene_fits, mean_fits) + fix_gene = options.fix_gene > 0.0 + fix_mean = options.fix_mean > 0.0 - if fit_ll < old_ll: - old_ll = fit_ll - best_start = gw_fits + if fix_gene: + gene_fits = [np.float64(options.fix_gene)] * count_matrix.shape[0] + if fix_mean: + mean_fits = [np.float64(options.fix_mean)] * count_matrix.shape[0] - gw_fits = best_start - - # subsequent interations to refine parameter estimates - iteration=1 + # iteratively search for maximum likelihood parameter estimates + iteration = 1 while True: sys.stderr.write("\niteration %d\n" % iteration) - + # first fit over dispersion params for each region sys.stderr.write("fitting per-region overdispersion params\n") - gene_fits, mean_fits, fit_ll = \ - get_gene_overdisp(count_matrix, expected_matrix, - gw_fits, gene_fits, mean_fits) + t1 = time.time() + #gene_fits, mean_fits, fit_ll = get_gene_overdisp(count_matrix, expected_matrix, + # gw_fits, gene_fits, mean_fits, + # fix_gene=fix_gene, + # fix_mean=fix_mean) + + gene_fit = gene_fits[0] + gene_fit, mean_fits, fit_ll = get_single_param_gene_overdisp(count_matrix, expected_matrix, + gw_fits, gene_fit, mean_fits) + + gene_fits[:] = [gene_fit] * len(gene_fits) + + + time_taken = time.time() - t1 + sys.stderr.write("time: %.2fs\n" % time_taken) + + sys.stderr.write("min(gene_fits): %g\n" % np.min(gene_fits)) + sys.stderr.write("max(gene_fits): %g\n" % np.max(gene_fits)) + sys.stderr.write("mean(gene_fits): %g\n" % np.mean(gene_fits)) + + sys.stderr.write("min(mean_fits): %g\n" % np.min(mean_fits)) + sys.stderr.write("max(mean_fits): %g\n" % np.max(mean_fits)) + sys.stderr.write("mean(mean_fits): %g\n" % np.mean(mean_fits)) + # then fit genome-wide overdispersion params for each individual sys.stderr.write("fitting genome-wide overdispersion params\n") + t1 = time.time() gw_fits, fit_ll = get_gw_overdisp(count_matrix, expected_matrix, gw_fits, gene_fits, mean_fits) - + time_taken = time.time() - t1 + sys.stderr.write("time: %.2fs\n" % time_taken) sys.stderr.write("LL=-%f\n" % fit_ll) + gw_str = " ".join("%.2f" % x for x in gw_fits) + sys.stderr.write("current genome-wide overdispersion param estimates:\n" + " %s\n" % gw_str) - # write estimates to outfile each iteration (this way outfile is written even if - # optimization terminated by user) + + # write estimates to outfile each iteration + # (this way outfile is written even if optimization terminated by user) outfile = open(options.outfile, "w") for i in gw_fits: outfile.write("%f\n" % i) outfile.close() - + iteration += 1 if old_ll - fit_ll < 0.0001: @@ -238,71 +227,191 @@ def main(): break old_ll = fit_ll + sys.stderr.write("done!\n") + + + - def get_gene_overdisp(count_matrix, expected_matrix, - gw_fits, gene_fits, mean_fits, iteration=0): + gw_fits, gene_fits, mean_fits, iteration=0, + fix_gene=False, fix_mean=False): fit_ll = 0 + gene_fit_improved = 0 + mean_fit_improved = 0 + for gene_indx in range(count_matrix.shape[0]): - mean_fits[gene_indx] = fmin(mean_like, mean_fits[gene_indx], - args=(count_matrix[gene_indx,:], - expected_matrix[gene_indx,:], - gw_fits, gene_fits[gene_indx]), - disp=False, maxfun=5000, xtol=1e-6)[0] - - starts = [np.float64(gene_fits[gene_indx]), - np.float64(0.05), np.float64(0.001)] - - best_par = np.float64(gene_fits[gene_indx]) - - best_like = gene_like(gene_fits[gene_indx], count_matrix[gene_indx,:], - expected_matrix[gene_indx,:], gw_fits, mean_fits[gene_indx]) - - for start in 
starts: - cur_par = fmin(gene_like, [start], - args=(count_matrix[gene_indx,:], + + if not fix_mean: + cur_like = mean_like(mean_fits[gene_indx], + count_matrix[gene_indx,:], expected_matrix[gene_indx,:], - gw_fits,mean_fits[gene_indx]), - disp=False,maxfun=5000,xtol=1e-6)[0] - - cur_like = gene_like(cur_par, count_matrix[gene_indx,:], + gw_fits, gene_fits[gene_indx]) + + xtol = min(mean_fits[gene_indx] * 1e-4, MEAN_XTOL) + + res = minimize_scalar(mean_like, + bounds=(MIN_MEAN_FIT, MAX_MEAN_FIT), + args=(count_matrix[gene_indx,:], + expected_matrix[gene_indx,:], + gw_fits, gene_fits[gene_indx]), + options={"xatol" : xtol}, + method="Bounded") + + like_diff = cur_like - res.fun + if like_diff >= 0.0: + # update parameter + mean_fits[gene_indx] = res.x + mean_fit_improved += 1 + else: + # likelihood got worse indicating failed to converge, + # do not accept new param value + pass + + if not fix_gene: + cur_like = gene_like(gene_fits[gene_indx], count_matrix[gene_indx,:], expected_matrix[gene_indx,:], gw_fits, mean_fits[gene_indx]) - - if cur_like < best_like: - best_par = cur_par - - gene_fits[gene_indx] = best_par - + + xtol = min(gene_fits[gene_indx] * 1e-4, GENE_XTOL) + + res = minimize_scalar(gene_like, + bounds=(MIN_GENE_FIT, MAX_GENE_FIT), + args=(count_matrix[gene_indx,:], + expected_matrix[gene_indx,:], + gw_fits, mean_fits[gene_indx]), + options={"xatol" : xtol}, + method="Bounded") + + like_diff = cur_like - res.fun + if like_diff >= 0.0: + # update parameter + gene_fits[gene_indx] = res.x + gene_fit_improved += 1 + else: + # likelihood got worse indicating failed to converge, + # do not accept new param value + pass + + fit_ll += gene_like(gene_fits[gene_indx], count_matrix[gene_indx,:], expected_matrix[gene_indx,:], gw_fits, mean_fits[gene_indx]) - + return gene_fits, mean_fits, fit_ll + +def get_single_param_gene_overdisp(count_matrix, expected_matrix, + gw_fits, gene_fit, mean_fits, iteration=0, + fix_gene=False, fix_mean=False): + fit_ll = 0 + gene_fit_improved = 0 + mean_fit_improved = 0 + + # update mean fits + for gene_indx in range(count_matrix.shape[0]): + + if not fix_mean: + cur_like = mean_like(mean_fits[gene_indx], + count_matrix[gene_indx,:], + expected_matrix[gene_indx,:], + gw_fits, gene_fit) + + xtol = min(mean_fits[gene_indx] * 1e-4, MEAN_XTOL) + + res = minimize_scalar(mean_like, + bounds=(MIN_MEAN_FIT, MAX_MEAN_FIT), + args=(count_matrix[gene_indx,:], + expected_matrix[gene_indx,:], + gw_fits, gene_fit), + options={"xatol" : xtol}, + method="Bounded") + + like_diff = cur_like - res.fun + if like_diff >= 0.0: + # update parameter + mean_fits[gene_indx] = res.x + mean_fit_improved += 1 + else: + # likelihood got worse indicating failed to converge, + # do not accept new param value + pass + + cur_like = single_param_gene_like(gene_fit, count_matrix, + expected_matrix, gw_fits, + mean_fits) + + xtol = min(gene_fit * 1e-4, GENE_XTOL) + + res = minimize_scalar(single_param_gene_like, + bounds=(MIN_GENE_FIT, MAX_GENE_FIT), + args=(count_matrix, + expected_matrix, + gw_fits, mean_fits), + options={"xatol" : xtol}, + method="Bounded") + + like_diff = cur_like - res.fun + + if like_diff >= 0.0: + # update parameter + gene_fit = res.x + fit_ll = res.fun + else: + # likelihood got worse indicating failed to converge, + # do not accept new param value + fit_ll = cur_like + + return gene_fit, mean_fits, fit_ll + + + + def get_gw_overdisp(count_matrix, expected_matrix, gw_fits, gene_fits, mean_fits): fit_ll = 0 + for indx in range(count_matrix.shape[1]): - gw_fits[indx] = 
fmin(gw_like,[gw_fits[indx]], - args=(count_matrix[:,indx], - expected_matrix[:,indx], - gene_fits, mean_fits), - disp=False, maxfun=50000, xtol=1e-6)[0] - + cur_like = gw_like(gw_fits[indx], count_matrix[:,indx], + expected_matrix[:,indx], gene_fits, mean_fits) + + init_gw_fit = gw_fits[indx] + + xtol = min(gw_fits[indx] * 1e-2, GW_XTOL) + + res = minimize_scalar(gw_like, + bounds=(MIN_GW_FIT, MAX_GW_FIT), + args=(count_matrix[:,indx], + expected_matrix[:,indx], + gene_fits, mean_fits), + options={'xatol' : xtol}, + # tol=GW_LIKE_TOL, + method="Bounded") + + new_like = res.fun + like_diff = cur_like - res.fun + + if like_diff >= 0.0: + # likelhood improved + gw_fits[indx] = res.x + like = new_like + else: + # likelihood did not improve because failed to converge + like = cur_like + like = gw_like(gw_fits[indx], count_matrix[:,indx], expected_matrix[:,indx], gene_fits, mean_fits) + fit_ll += like return gw_fits, fit_ll def mean_like(mean_fit, counts, expecteds, gw_fits, gene_fit): - if mean_fit < 0: + if mean_fit < MIN_MEAN_FIT or mean_fit > MAX_MEAN_FIT: return 1e8 loglike=0 for i in range(len(counts)): @@ -313,7 +422,7 @@ def mean_like(mean_fit, counts, expecteds, gw_fits, gene_fit): def gene_like(gene_fit, counts, expecteds, gw_fits, mean_fit): - if gene_fit < 0: + if gene_fit < MIN_GENE_FIT or gene_fit > MAX_GENE_FIT: return 1e8 loglike=0 for i in range(len(counts)): @@ -322,8 +431,21 @@ def gene_like(gene_fit, counts, expecteds, gw_fits, mean_fit): return -loglike +def single_param_gene_like(gene_fit, count_matrix, expected_matrix, gw_fits, mean_fits): + ll = 0 + + for gene_indx in range(count_matrix.shape[0]): + ll += gene_like(gene_fit, + count_matrix[gene_indx,:], + expected_matrix[gene_indx,:], + gw_fits, mean_fits[gene_indx]) + + return ll + + + def gw_like(gw_fit, counts, expecteds, gene_fits, mean_fits): - if gw_fit >1000000 or gw_fit < 1: + if gw_fit > MAX_GW_FIT or gw_fit < MIN_GW_FIT: return 1e8 loglike = 0 for i in range(len(counts)): @@ -333,6 +455,8 @@ def gw_like(gw_fit, counts, expecteds, gene_fits, mean_fits): + + def addlogs(loga, logb): """Helper function: perform numerically-stable addition in log space""" return max(loga, logb) + math.log(1 + math.exp(-abs(loga - logb))) @@ -342,7 +466,7 @@ def addlogs(loga, logb): def lbeta_asymp(a,b): if b > a: a,b = b,a - + if a<1e6: return betaln(a,b) @@ -352,7 +476,7 @@ def lbeta_asymp(a,b): l += b*(1-b)/(2*a) l += b*(1-b)*(1-2*b)/(12*a*a) l += -((b*(1-b))**2)/(12*a**3) - + return l @@ -373,9 +497,9 @@ def BNB_loglike(k,mean,n,sigma): a = p * sigma + 1 b = (1-p) * sigma - + loglike = 0 - + #Rising Pochhammer = gamma(k+n)/gamma(n) #for j in range(k): # loglike += math.log(j+n) @@ -384,55 +508,15 @@ def BNB_loglike(k,mean,n,sigma): #loglike=scipy.special.gammaln(k+n)-scipy.special.gammaln(n) else: loglike=0 - + #Add log(beta(a+n,b+k)) loglike += lbeta_asymp(a+n,b+k) - + #Subtract log(beta(a,b)) loglike -= lbeta_asymp(a,b) return loglike -def parse_test_snp(snpinfo, options): - snp_id = snpinfo[2] - if snpinfo[16] == "NA": - # SNP is missing data - tot = 0 - else: - tot = np.float64(snpinfo[16]) - - if snpinfo[6] == "NA": - geno_hap1 = 0 - geno_hap2 = 0 - else: - geno_hap1 = int(snpinfo[6].strip().split("|")[0]) - geno_hap2 = int(snpinfo[6].strip().split("|")[1]) - - if snpinfo[15] == "NA": - count = 0 - else: - count = int(snpinfo[15]) - - if snpinfo[9].strip() == "NA" or geno_hap1 == geno_hap2: - # SNP is homozygous, so there is no AS info - return TestSNP(snp_id, geno_hap1, geno_hap2, [], [], [], tot, count) - else: - # 
positions of target SNPs (not currently used) - snplocs=[int(y.strip()) for y in snpinfo[9].split(';')] - - # counts of reads that match reference overlapping linked 'target' SNPs - AS_target_ref = [int(y) for y in snpinfo[12].split(';')] - - # counts of reads that match alternate allele - AS_target_alt = [int(y) for y in snpinfo[13].split(';')] - - # heterozygote probabilities - hetps = [np.float64(y.strip()) for y in snpinfo[10].split(';')] - - # linkage probabilities, not currently used - linkageps = [np.float64(y.strip()) for y in snpinfo[11].split(';')] - return TestSNP(snp_id, geno_hap1, geno_hap2, AS_target_ref, - AS_target_alt, hetps, tot, count) main() diff --git a/CHT/qqplot.R b/CHT/qqplot.R new file mode 100644 index 0000000..0581d1c --- /dev/null +++ b/CHT/qqplot.R @@ -0,0 +1,60 @@ + +# read output filename and list of input filenames containing +# CHT results from command line +args = commandArgs(trailingOnly=TRUE) +if (length(args) < 2) { + usage <- "Usage:\n RScript --vanilla OUTPUT.png CHT_OUTPUT_1.txt [... CHT_OUTPUT_N.txt]" + stop(paste("At least two arguments must be supplied.\n", usage), call.=FALSE) +} + +png.filename <- args[1] +input.filenames <- args[2:length(args)] + +# check that output filename looks like a PNG file +if(length(grep(".png$", png.filename)) == 0) { + stop("expected output filename to end with .png", call.=FALSE) +} + +# choose set of colors +library(RColorBrewer) +pal <- brewer.pal(9, "Set1") + +png(png.filename, width=500, height=500) + +labels <- c() + +min.p <- 1e-20 + +for(i in 1:length(input.filenames)) { + filename <- input.filenames[i] + + tab <- read.table(filename, header=T) + n.test <- nrow(tab) + null.p <- (1:n.test)/(n.test) + obs.p <- tab$P.VALUE + + # cap p-values at min.p for drawing purposes + obs.p[obs.p < min.p] <- min.p + null.p[null.p < min.p] <- min.p + + vals <- qqplot(-log10(null.p), -log10(obs.p), plot.it=F) + + # make a legend label from filename, stripping off extension and leading directories + s <- unlist(strsplit(filename, "/")) + lab <- unlist(strsplit(s[length(s)], "[.]"))[1] + labels[i] <- lab + + if(i == 1) { + plot(vals$x, vals$y, col=pal[i], las=1, + xlab="null -log10 p-values", + ylab="observed -log10 p-values") + abline(a=0, b=1) + } else { + points(vals$x, vals$y, col=pal[i]) + } +} + +legend("topleft", legend=labels, pch=20, col=pal[1:length(labels)]) + +dev.off() + diff --git a/CHT/run_snakemake.sh b/CHT/run_snakemake.sh new file mode 100755 index 0000000..2cce9e5 --- /dev/null +++ b/CHT/run_snakemake.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# Run snakemake on an SGE cluster (that uses qsub command for submission) +# Run at most 30 jobs at a time +# To run on an LSF cluster, change "qsub -V" to "bsub" +snakemake --cluster "qsub -V" --jobs 30 diff --git a/CHT/simulate_counts.py b/CHT/simulate_counts.py new file mode 100644 index 0000000..82d936a --- /dev/null +++ b/CHT/simulate_counts.py @@ -0,0 +1,361 @@ +from random import choice, sample, random +from numpy.random import beta, negative_binomial, binomial +import numpy as np +import sys + +import argparse + + +def write_options(f, options): + f.write("# prefix = %s\n" % options.prefix) + f.write("# num_tests = %d\n" % options.num_tests) + f.write("# num_inds = %d\n" % options.num_inds) + f.write("# min_hets = %d\n" % options.min_hets) + f.write("# maf = %g\n" % options.maf) + f.write("# mean_counts = %g\n" % options.mean_counts) + f.write("# mean_counts_distr = %s\n" % options.mean_counts_distr) + f.write("# as_counts = %g\n" % options.as_counts) + f.write("# 
gene_disp = %g\n" % options.gene_disp) + f.write("# gene_disp_distr = %s\n" % options.gene_disp_distr) + f.write("# ind_disp = %s\n" % ",".join(["%g" % x for x in options.ind_disp])) + f.write("# as_disp = %g\n" % options.as_disp) + f.write("# effect_size = %g\n" % options.effect_size) + f.write("# additivity = %g\n" % options.additivity) + f.write("# het_error_rate = %g\n" % options.het_error_rate) + f.write("# read_error_rate = %g\n" % options.read_error_rate) + f.write("# true_positives = %g\n" % options.true_positives) + f.write("# sim_hom_as = %s\n" % options.sim_hom_as) + + +def write_header(f): + f.write("CHROM " + "TEST.SNP.POS " + "TEST.SNP.ID " + "TEST.SNP.REF.ALLELE " + "TEST.SNP.ALT.ALLELE " + "TEST.SNP.GENOTYPE " + "TEST.SNP.HAPLOTYPE " + "REGION.START " + "REGION.END " + "REGION.SNP.POS " + "REGION.SNP.HET.PROB " + "REGION.SNP.LINKAGE.PROB " + "REGION.SNP.REF.HAP.COUNT " + "REGION.SNP.ALT.HAP.COUNT " + "REGION.SNP.OTHER.HAP.COUNT " + "REGION.READ.COUNT " + "GENOMEWIDE.READ.COUNT\n") + + + +def parse_options(): + parser = argparse.ArgumentParser(description="simulate counts for " + "combined haplotype test") + + parser.add_argument("--prefix", default=None, required=True, + help="prefix for output files") + + dflt_num_tests = 10000 + parser.add_argument("--num_tests", default=dflt_num_tests, type=int, + help="number of regions to simulate " + "(default=%d)" % dflt_num_tests) + + dflt_num_inds = 10 + parser.add_argument("--num_inds", default=dflt_num_inds, type=int, + help="number of individuals to simulate " + "(default=%d)" % dflt_num_inds) + + dflt_min_hets = 2 + parser.add_argument("--min_hets", default=dflt_min_hets, type=int, + help="minimum number of heterozygous individuals " + "per test SNP (default=%d)" % dflt_min_hets) + + dflt_maf = 0.2 + parser.add_argument("--maf", default=dflt_maf, type=float, + help="minor allele frequency of test SNP " + "(default=%.2f)" % dflt_maf) + + dflt_mean_counts = 200.0 + parser.add_argument("--mean_counts", default=dflt_mean_counts, type=float, + help="mean number of read counts per region " + "(default=%.2f)" % dflt_mean_counts) + + dflt_mean_counts_distr = "POINT" + parser.add_argument("--mean_counts_distr", default=dflt_mean_counts_distr, + help="distribution for mean number of " + "read counts per region (default=%s). " + " If EXPONENTIAL is specified, the value of " + "mean_counts is used as the scale parameter " + "(mean) of the distribution" % dflt_mean_counts_distr, + choices=("POINT", "EXPONENTIAL")) + + dflt_as_counts = 20.0 + parser.add_argument("--as_counts", default=dflt_as_counts, type=float, + help="expected number of allele-specific read counts " + "per regions (default=%.2f)" % dflt_as_counts) + + dflt_gene_disp = 0.01 + parser.add_argument("--gene_disp", default=dflt_gene_disp, type=float, + help="per-gene overdispersion parameter for " + "beta-negative binomial (default=%.2f)" % + dflt_gene_disp) + + dflt_gene_disp_distr = "POINT" + parser.add_argument("--gene_disp_distr", default=dflt_gene_disp_distr, + help="distribution for sampling per-gene " + "overdispersion from (default=%s). If EXPONENTIAL " + "is specified, the value of gene_disp is used as " + "the scale parameter (mean) of the distribution" + % dflt_gene_disp, choices=("POINT", "EXPONENTIAL")) + + dflt_ind_disp = "100.0" + parser.add_argument("--ind_disp", default=dflt_ind_disp, + help="per individual overdispersion parameter(s) for " + "beta-negative binomial. 
Can either provide a single value " + "that is used for all individuals or a comma-delimited list " + "of values (one per individual) (default=%s)." % dflt_ind_disp) + + dflt_as_disp = 0.2 + parser.add_argument("--as_disp", default=dflt_as_disp, type=float, + help="per individual allele-specific overdispersion " + "parameter for beta-binomial (default=%.2f)" % dflt_as_disp) + + dflt_effect_size = 0.2 + parser.add_argument("--effect_size", default=dflt_effect_size, type=float, + help="effect size of true positives (default=%.2f)" % + dflt_effect_size) + + dflt_additivity = 1.0 + parser.add_argument("--additivity", default=dflt_additivity, type=float, + help="additivity of alleles (default=%.2f)" % + dflt_additivity) + + dflt_het_error_rate = 0.01 + parser.add_argument("--het_error_rate", default=dflt_het_error_rate, + type=float, help="rate of incorrect heterozygous genotype calls" + "(default=%.2f)" % dflt_het_error_rate) + + dflt_read_error_rate = 0.01 + parser.add_argument("--read_error_rate", default=dflt_read_error_rate, + type=float, help="rate of incorrect alleles in reads" + "(default=%.2f)" % dflt_read_error_rate) + + dflt_true_positives = 0.05 + parser.add_argument("--true_positives", default=dflt_true_positives, + type=float, help="fraction of test SNPs that are " + "true positives (default=%.2f)" % dflt_true_positives) + + dflt_sim_hom_as = False + parser.add_argument("--sim_hom_as", action="store_true", dest="sim_hom_as", + help="simulate allele specific counts " + "at homozygous test SNPs (default=False)", default=False) + + options = parser.parse_args() + + # split apart individual dispersion value string + vals = [float(x) for x in options.ind_disp.split(",")] + if len(vals) == 1: + options.ind_disp = [vals[0]] * options.num_inds + elif len(vals) != options.num_inds: + raise ValueError("number of ind_disp values should be " + "equal to 1 or to num_ind (%d)\n" % options.num_inds) + else: + options.ind_disp = vals + + return options + + + + +def main(): + options = parse_options() + + out_files = [] + sys.stderr.write("creating output files:\n") + file_list = open("%s_file_list.txt" % options.prefix, "w") + for i in range(options.num_inds): + out_filename = "%s_%d.txt" % (options.prefix, i+1) + sys.stderr.write(" %s\n" % out_filename) + + out_files.append(open(out_filename, "w")) + # write_options(out_files[i], options) + write_header(out_files[i]) + file_list.write(out_filename + "\n") + file_list.close() + + + ASseq_Y_file = open("%s_Y.txt" % options.prefix, "w") + ASseq_Y1_file = open("%s_Y1.txt" % options.prefix, "w") + ASseq_Y2_file = open("%s_Y2.txt" % options.prefix, "w") + ASseq_Z_file = open("%s_Z.txt" % options.prefix, "w") + + test = 1 + while test <= options.num_tests: + if random() > options.true_positives: + # simulate a site with no effect, this is not a positive + effect = 0 + alt_expr = 1 + AS_frac = 0.5 + elif random() < 0.5: + # simulate a site with effect and beta > alpha + effect = 0 + alt_expr = 1.0 + options.effect_size + AS_frac = 1.0 / (2.0 + options.additivity * options.effect_size) + else: + # simulate a site with effect and alpha > beta + effect = 1 + alt_expr = 1 / (1 + options.effect_size) + AS_frac = (1 + options.additivity * options.effect_size) / \ + (2.0 + options.additivity * options.effect_size) + + snps = [] + counts = [] + num_hets = 0 + + + if options.mean_counts_distr == "POINT": + mean_counts = options.mean_counts + elif options.mean_counts_distr == "EXPONENTIAL": + mean_counts = np.random.exponential(options.mean_counts) + else: + 
raise ValueError("unknown distribution %s\n" %
+                             options.mean_counts_distr)
+
+        if options.gene_disp_distr == "POINT":
+            gene_disp = options.gene_disp
+        elif options.gene_disp_distr == "EXPONENTIAL":
+            gene_disp = np.random.exponential(options.gene_disp)
+            sys.stderr.write("gene_disp: %.2f\n" % gene_disp)
+        else:
+            raise ValueError("unknown distribution: %s\n" %
+                             options.gene_disp_distr)
+
+
+        for ind in range(options.num_inds):
+            # Simulate the individual's haps=[0,0]
+            # prob of each minor allele is MAF (minor allele freq)
+            is_het = False
+
+            n_minor = int(random() < options.maf) + int(random() < options.maf)
+            if n_minor == 0:
+                # no minor alleles
+                haps = [0,0]
+            elif n_minor == 1:
+                # heterozygous
+                haps = [0,1]
+                num_hets += 1
+                is_het = True
+            else:
+                # two minor alleles
+                haps = [1,1]
+
+            # Expected number of reads based on genotypes
+            ind_mean_counts = mean_counts * ((2 - n_minor) + (n_minor * alt_expr))
+            #sys.stderr.write("n_minor: %d alt_expr: %g mean_counts: %g " %
+            #                 (n_minor, alt_expr, ind_mean_counts))
+
+            sim_count = simulate_BNB(ind_mean_counts, gene_disp, options.ind_disp[ind])
+
+            if is_het:
+                if random() < options.het_error_rate:
+                    # simulate a homozygous site that was miscalled
+                    # as a heterozygote
+                    if haps[0] == 0:
+                        ref, alt = simulate_BB(options.as_counts,
+                                               options.read_error_rate, options.as_disp)
+                    else:
+                        ref, alt = simulate_BB(options.as_counts, 1-options.read_error_rate,
+                                               options.as_disp)
+                else:
+                    ref, alt = simulate_BB(options.as_counts, AS_frac,
+                                           options.as_disp)
+            else:
+                if options.sim_hom_as:
+                    # simulate allele-specific counts even when test SNP
+                    # is homozygous
+                    ref, alt = simulate_BB(options.as_counts, 0.5, options.as_disp)
+                else:
+                    ref, alt = 0, 0
+            snps.append(TestSNP(effect, test, haps, sim_count,
+                                ref, alt, 1.0 - options.het_error_rate))
+            counts.append(sim_count)
+
+        mean_counts = np.mean(counts)
+        Y=[]
+        Y1=[]
+        Y2=[]
+        Z=[]
+
+        if num_hets >= options.min_hets:
+            for snp_indx in range(len(snps)):
+                snps[snp_indx].set_total_counts(mean_counts)
+                out_files[snp_indx].write(snps[snp_indx].print_snp())
+                out_files[snp_indx].flush()
+
+                Y.append(snps[snp_indx].count)
+                Y1.append(snps[snp_indx].ref_count)
+                Y2.append(snps[snp_indx].alt_count)
+
+                if(snps[snp_indx].haps[0]==0 and snps[snp_indx].haps[1]==0):
+                    Z.append(0)
+                elif(snps[snp_indx].haps[0]==0 and snps[snp_indx].haps[1]==1):
+                    Z.append(1)
+                elif(snps[snp_indx].haps[0]==1 and snps[snp_indx].haps[1]==0):
+                    Z.append(1)
+                elif(snps[snp_indx].haps[0]==1 and snps[snp_indx].haps[1]==1):
+                    Z.append(4)
+
+            ASseq_Y_file.write("\t".join(str(y) for y in Y)+"\n")
+            ASseq_Y1_file.write("\t".join(str(y1) for y1 in Y1)+"\n")
+            ASseq_Y2_file.write("\t".join(str(y2) for y2 in Y2)+"\n")
+            ASseq_Z_file.write("\t".join(str(z) for z in Z)+"\n")
+            test+=1
+class TestSNP:
+    def __init__(self,effect,test_num,haps,count,as_ref,as_alt,hetp):
+        self.chrm = effect
+        self.pos = test_num
+        self.ref_allele = "A"
+        self.alt_allele = "T"
+        self.count = count
+        self.ref_count = as_ref
+        self.alt_count = as_alt
+        self.haps = haps
+        if haps[0] != haps[1]:
+            self.hetp=hetp
+        else:
+            self.hetp = 0
+        self.genotype = sum(haps)
+        self.tot_count = 0
+
+    def set_total_counts(self, tot_count):
+        self.tot_count = tot_count
+
+    def print_snp(self):
+        return("%i %i %i %s %s %i %i|%i %i %i %i %f %i %i %i %i %i %i\n" %
+               (self.chrm, self.pos, self.pos, self.ref_allele, self.alt_allele,
+                self.genotype, self.haps[0], self.haps[1], self.pos, self.pos+1,
+                self.pos, self.hetp, 1, self.ref_count, self.alt_count, 0,
+                self.count, self.tot_count))
+
+def 
simulate_BNB(mean, sigma, n): + # sys.stderr.write("%g %g %g\n" % (mean, sigma, n)) + mean_p = np.float64(n) / (n+mean) + sigma = (1 / sigma)**2 + a = mean_p * (sigma)+1 + b = (1 - mean_p)*sigma + + p = beta(a, b) + #sys.stderr.write("%f %f\n"%(n,p)) + counts = negative_binomial(n, p) + return counts + +def simulate_BB(tot, mean_p, sigma): + a = mean_p * (1/sigma**2 - 1) + b = (1-mean_p) * (1/sigma**2 - 1) + + p = beta(a,b) + counts = binomial(tot,p) + #sys.stderr.write("%f %f %i\n"%(mean_p,p,counts)) + return counts, (tot-counts) + +main() diff --git a/CHT/snake_conf.yaml b/CHT/snake_conf.yaml new file mode 100644 index 0000000..9164157 --- /dev/null +++ b/CHT/snake_conf.yaml @@ -0,0 +1,51 @@ +# +# This is the Snakemake configuration file that specifies paths and +# and options for the Combined Haplotype Pipeline +# + +{ + # py2 is a bit of a hack so that python2 scripts can be called + # by snakemake (which is written in python3). The value should + # do whatever is needed to setup a python2 environment and + # call the python2 interpreter. Depending on your system configuration, + # this might involve setting the PATH environment variable or + # just calling python2 + "py2" : "PATH=$HOME/anaconda2/bin:$PATH; python ", + + # Rscript should set environment as appropriate and call Rscript interpreter + "Rscript" : "PATH=$HOME/anaconda2/bin:$PATH; Rscript ", + + # WASP directory + "wasp_dir" : "/iblm/netapp/home/gmcvicker/proj/WASP", + + + # directory to write all output data files to + "base_dir" : "/iblm/netapp/home/gmcvicker/proj/WASP/CHT/output", + + # minimum number of allele-specific reads (combined across individuals) + # required to perform combined test on a region + "min_as_count" : "10", + + # location of chromInfo file containing chromosome names and lengths + # (can be downloaded from UCSC genome browser) + "chrom_info" : "/iblm/netapp/home/gmcvicker/proj/WASP/examples/example_data/chromInfo.hg19.txt", + + # directory containing impute or VCF SNP files + "snp_dir" : "/iblm/netapp/home/gmcvicker/proj/WASP/examples/example_data/genotypes", + # file containing sample identifiers for impute or VCF SNP file + "snp_samples" : "/iblm/netapp/home/gmcvicker/proj/WASP/examples/example_data/genotypes/YRI_samples.txt", + + # directory containing bowtie2 index + # one file per chromosome + "fasta_dir" : "/iblm/netapp/data1/external/GRC37/chroms", + + # text file containing sample identifiers for the subset of + # samples that are used in study (one per line) + "samples_file" : "/iblm/netapp/home/gmcvicker/proj/WASP/examples/example_data/H3K27ac/samples.txt", + + # directory containing BAM files + "bam_dir" : "/iblm/netapp/home/gmcvicker/proj/WASP/examples/example_data/H3K27ac", + + # postfix of BAM files + "bam_postfix" : ".keep.rmdup.bam" +} diff --git a/CHT/update_het_probs.py b/CHT/update_het_probs.py index 830f0b2..97f5ebe 100644 --- a/CHT/update_het_probs.py +++ b/CHT/update_het_probs.py @@ -52,7 +52,7 @@ def main(): args = parse_options() if args.infile.endswith(".gz"): - infile = gzip.open(args.infile, "r") + infile = gzip.open(args.infile, "rt") else: infile = open(args.infile, "r") diff --git a/CHT/update_total_depth.py b/CHT/update_total_depth.py index 16f809b..889fd14 100644 --- a/CHT/update_total_depth.py +++ b/CHT/update_total_depth.py @@ -54,29 +54,37 @@ def parse_options(): parser.add_argument("--seq", required=True, help="Path to HDF5 file containing " - "genome sequence. (Can be created " - "using fasta2h5 program)", + "genome sequence. 
Used to calculate GC content " + "of each region. Can be created " + "using fasta2h5 program.", metavar="SEQ_H5_FILE") parser.add_argument("-i", "--fit_in_file", action='store', dest='fit_in_file', default=None, - help="read coefficients from specified file") + help="Read coefficients from specified file " + "rather than estimating them.") parser.add_argument("-o", "--fit_out_file", action='store', dest='fit_out_file', default=None, - help="only fit the model and write " - "coefficients to specified file ") + help="Estimate coefficients and write them " + "to specified file, but do not adjust read counts.") parser.add_argument("-m", "--min_counts", action='store', - type=int, - dest='min_counts', default=0, - help="minimum counts to use row ") + type=int, dest='min_counts', default=0, + help="only use rows with at least min_counts for fitting") parser.add_argument("--skip", action='store', type=int, dest='skips', default=0, - help="lines to skip between each " - "used for calculation") + help="specify a number of rows to skip between each row " + "used for estimating coefficients.") + + dflt_sample = 10000 + parser.add_argument("--sample", action='store', type=int, + default=dflt_sample, help="randomly sample this many rows " + "and use them for fitting coefficients. Specify 0 if all " + "rows are to be used. (default=%d)" % dflt_sample) + return parser.parse_args() @@ -101,10 +109,11 @@ def main(): sys.stderr.write("Updating totals\n") if args.fit_out_file: - write_splines(coefs_list,args.fit_out_file) + write_splines(coefs_list, args.fit_out_file) else: outlist = [line.strip() for line in open(args.outfile_list,"r")] + update_totals(inlist, outlist, count_table, coefs_list, keep_list) @@ -150,7 +159,7 @@ def get_at_gc_count(seq_h5, chrm, start, end): def load_data(inlist, seq_h5_filename, min_counts, skips): - infiles = open_files(inlist, "r") + infiles = open_files(inlist, "rt") seq_h5 = tables.openFile(seq_h5_filename, "r") @@ -195,11 +204,12 @@ def load_data(inlist, seq_h5_filename, min_counts, skips): else: num_NAs += 1 for skip in range(skips): + # skip lines line = infiles[ind].readline() line = infiles[ind].readline() if not line: - end_of_file=True + end_of_file = True else: info_list[ind] = line.strip().split() @@ -318,7 +328,7 @@ def splinefit(arg, x1, x2, y): def update_totals(inlist, outlist, count_table, coefs_table, keep_list): - infiles = open_files(inlist,"r") + infiles = open_files(inlist,"rt") outfiles = open_files(outlist,"w") row = 0 count_row = 0 @@ -340,7 +350,9 @@ def update_totals(inlist, outlist, count_table, coefs_table, has_NAs=True break - adj_tot = calc_adjusted_totals(count_table[count_row,0], count_table[count_row, 1], coefs_table[ind]) + adj_tot = calc_adjusted_totals(count_table[count_row, 0], + count_table[count_row, 1], + coefs_table[ind]) adj_tot_list.append(max(adj_tot, -1000000)) for ind in range(len(infiles)): @@ -358,7 +370,7 @@ def update_totals(inlist, outlist, count_table, coefs_table, count_row+=1 else: for ind in range(len(infiles)): - line=infiles[ind].readline() + line = infiles[ind].readline() row += 1 for infile in infiles: infile.close() diff --git a/README.md b/README.md index 10b7c18..c170daf 100644 --- a/README.md +++ b/README.md @@ -26,16 +26,19 @@ Each directory contains its own README file: * [CHT](./CHT) - Code for running the Combined Haplotype Test -* [mapping](./mapping) -Tools for correcting mapping biases +* [mapping](./mapping) - Mappability filtering pipeline for correcting allelic mapping biases * [snp2h5](./snp2h5) - 
Contains snp2h5 and fasta2h5: programs for converting common SNP and sequence data formats (IMPUTE, VCF and FASTA) to an efficient binary format, HDF5. -* [example_data](./example_data) - Example data files that can be used to try out the +* [examples](./examples) - Example data files that can be used to try out the Combined Haplotype Test. -* [example_workflow.sh](./example_workflow.sh) - A script illustrating +* [example_mapping_workflow.sh](./examples/example_mapping_workflow.sh) - A script illustrating + how each step of the Mappability Filtering Pipeline can be run. + +* [example_cht_workflow.sh](./examples/example_cht_workflow.sh) - A script illustrating how each step of the Combined Haplotype Test workflow can be run. @@ -59,9 +62,9 @@ The easiest way to install [HDF5](https://www.hdfgroup.org/HDF5/), [numpy](http://www.numpy.org), [scipy](http://scipy.org) and [Pytables](http://www.pytables.org/) is to download and install [Anaconda](http://continuum.io/downloads). *Installing Anaconda is -highly recommended.* After installing Anaconda, the only dependency -that must be downloaded and installed is -[pysam](https://github.com/pysam-developers/pysam). +highly recommended.* After installing Anaconda, configure [Bioconda](https://bioconda.github.io/) +and do `conda install pysam`, or download and install +[pysam](https://github.com/pysam-developers/pysam) directly. ## Installation @@ -69,7 +72,13 @@ that must be downloaded and installed is 1. Download and install [Anaconda](http://continuum.io/downloads), (or download and install Numpy, Scipy, HDF5, and Pytables separately). -2. Download and install [pysam](https://github.com/pysam-developers/pysam) +2. Configure Bioconda and install pysam: + + conda config --add channels r + conda config --add channels bioconda + conda install pysam + Alternatively, download and install [pysam](https://github.com/pysam-developers/pysam) + yourself. 3. Make sure that the HDF5 library is in your library path. 
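(Editor's illustrative aside, not part of the patch: once the Anaconda environment is set up, a quick way to confirm that the Python dependencies and the HDF5 library are visible is a short import check such as the one below; the snippet is only a suggested sanity check, and the version printout is purely informational.)

    # verify that the WASP Python dependencies import cleanly;
    # importing tables also exercises the HDF5 shared library
    import numpy, scipy, tables, pysam
    print(tables.__version__, pysam.__version__)
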
For example on Linux or OSX you can add the following to your .bashrc or .profile (replace diff --git a/example_data/genotypes/snps/chr22.snps.txt.gz b/example_data/genotypes/snps/chr22.snps.txt.gz deleted file mode 100644 index db75470..0000000 Binary files a/example_data/genotypes/snps/chr22.snps.txt.gz and /dev/null differ diff --git a/example_workflow.sh b/examples/example_cht_workflow.sh similarity index 100% rename from example_workflow.sh rename to examples/example_cht_workflow.sh diff --git a/example_data/H3K27ac/18505.chr22.keep.rmdup.bam b/examples/example_data/H3K27ac/18505.chr22.keep.rmdup.bam similarity index 100% rename from example_data/H3K27ac/18505.chr22.keep.rmdup.bam rename to examples/example_data/H3K27ac/18505.chr22.keep.rmdup.bam diff --git a/example_data/H3K27ac/18505.chr22.keep.rmdup.bam.bai b/examples/example_data/H3K27ac/18505.chr22.keep.rmdup.bam.bai similarity index 100% rename from example_data/H3K27ac/18505.chr22.keep.rmdup.bam.bai rename to examples/example_data/H3K27ac/18505.chr22.keep.rmdup.bam.bai diff --git a/example_data/H3K27ac/18507.chr22.keep.rmdup.bam b/examples/example_data/H3K27ac/18507.chr22.keep.rmdup.bam similarity index 100% rename from example_data/H3K27ac/18507.chr22.keep.rmdup.bam rename to examples/example_data/H3K27ac/18507.chr22.keep.rmdup.bam diff --git a/example_data/H3K27ac/18507.chr22.keep.rmdup.bam.bai b/examples/example_data/H3K27ac/18507.chr22.keep.rmdup.bam.bai similarity index 100% rename from example_data/H3K27ac/18507.chr22.keep.rmdup.bam.bai rename to examples/example_data/H3K27ac/18507.chr22.keep.rmdup.bam.bai diff --git a/example_data/H3K27ac/18508.chr22.keep.rmdup.bam b/examples/example_data/H3K27ac/18508.chr22.keep.rmdup.bam similarity index 100% rename from example_data/H3K27ac/18508.chr22.keep.rmdup.bam rename to examples/example_data/H3K27ac/18508.chr22.keep.rmdup.bam diff --git a/example_data/H3K27ac/18508.chr22.keep.rmdup.bam.bai b/examples/example_data/H3K27ac/18508.chr22.keep.rmdup.bam.bai similarity index 100% rename from example_data/H3K27ac/18508.chr22.keep.rmdup.bam.bai rename to examples/example_data/H3K27ac/18508.chr22.keep.rmdup.bam.bai diff --git a/example_data/H3K27ac/18516.chr22.keep.rmdup.bam b/examples/example_data/H3K27ac/18516.chr22.keep.rmdup.bam similarity index 100% rename from example_data/H3K27ac/18516.chr22.keep.rmdup.bam rename to examples/example_data/H3K27ac/18516.chr22.keep.rmdup.bam diff --git a/example_data/H3K27ac/18516.chr22.keep.rmdup.bam.bai b/examples/example_data/H3K27ac/18516.chr22.keep.rmdup.bam.bai similarity index 100% rename from example_data/H3K27ac/18516.chr22.keep.rmdup.bam.bai rename to examples/example_data/H3K27ac/18516.chr22.keep.rmdup.bam.bai diff --git a/example_data/H3K27ac/18522.chr22.keep.rmdup.bam b/examples/example_data/H3K27ac/18522.chr22.keep.rmdup.bam similarity index 100% rename from example_data/H3K27ac/18522.chr22.keep.rmdup.bam rename to examples/example_data/H3K27ac/18522.chr22.keep.rmdup.bam diff --git a/example_data/H3K27ac/18522.chr22.keep.rmdup.bam.bai b/examples/example_data/H3K27ac/18522.chr22.keep.rmdup.bam.bai similarity index 100% rename from example_data/H3K27ac/18522.chr22.keep.rmdup.bam.bai rename to examples/example_data/H3K27ac/18522.chr22.keep.rmdup.bam.bai diff --git a/example_data/H3K27ac/19141.chr22.keep.rmdup.bam b/examples/example_data/H3K27ac/19141.chr22.keep.rmdup.bam similarity index 100% rename from example_data/H3K27ac/19141.chr22.keep.rmdup.bam rename to examples/example_data/H3K27ac/19141.chr22.keep.rmdup.bam diff --git 
a/example_data/H3K27ac/19141.chr22.keep.rmdup.bam.bai b/examples/example_data/H3K27ac/19141.chr22.keep.rmdup.bam.bai similarity index 100% rename from example_data/H3K27ac/19141.chr22.keep.rmdup.bam.bai rename to examples/example_data/H3K27ac/19141.chr22.keep.rmdup.bam.bai diff --git a/example_data/H3K27ac/19193.chr22.keep.rmdup.bam b/examples/example_data/H3K27ac/19193.chr22.keep.rmdup.bam similarity index 100% rename from example_data/H3K27ac/19193.chr22.keep.rmdup.bam rename to examples/example_data/H3K27ac/19193.chr22.keep.rmdup.bam diff --git a/example_data/H3K27ac/19193.chr22.keep.rmdup.bam.bai b/examples/example_data/H3K27ac/19193.chr22.keep.rmdup.bam.bai similarity index 100% rename from example_data/H3K27ac/19193.chr22.keep.rmdup.bam.bai rename to examples/example_data/H3K27ac/19193.chr22.keep.rmdup.bam.bai diff --git a/example_data/H3K27ac/19204.chr22.keep.rmdup.bam b/examples/example_data/H3K27ac/19204.chr22.keep.rmdup.bam similarity index 100% rename from example_data/H3K27ac/19204.chr22.keep.rmdup.bam rename to examples/example_data/H3K27ac/19204.chr22.keep.rmdup.bam diff --git a/example_data/H3K27ac/19204.chr22.keep.rmdup.bam.bai b/examples/example_data/H3K27ac/19204.chr22.keep.rmdup.bam.bai similarity index 100% rename from example_data/H3K27ac/19204.chr22.keep.rmdup.bam.bai rename to examples/example_data/H3K27ac/19204.chr22.keep.rmdup.bam.bai diff --git a/example_data/H3K27ac/19238.chr22.keep.rmdup.bam b/examples/example_data/H3K27ac/19238.chr22.keep.rmdup.bam similarity index 100% rename from example_data/H3K27ac/19238.chr22.keep.rmdup.bam rename to examples/example_data/H3K27ac/19238.chr22.keep.rmdup.bam diff --git a/example_data/H3K27ac/19238.chr22.keep.rmdup.bam.bai b/examples/example_data/H3K27ac/19238.chr22.keep.rmdup.bam.bai similarity index 100% rename from example_data/H3K27ac/19238.chr22.keep.rmdup.bam.bai rename to examples/example_data/H3K27ac/19238.chr22.keep.rmdup.bam.bai diff --git a/example_data/H3K27ac/19239.chr22.keep.rmdup.bam b/examples/example_data/H3K27ac/19239.chr22.keep.rmdup.bam similarity index 100% rename from example_data/H3K27ac/19239.chr22.keep.rmdup.bam rename to examples/example_data/H3K27ac/19239.chr22.keep.rmdup.bam diff --git a/example_data/H3K27ac/19239.chr22.keep.rmdup.bam.bai b/examples/example_data/H3K27ac/19239.chr22.keep.rmdup.bam.bai similarity index 100% rename from example_data/H3K27ac/19239.chr22.keep.rmdup.bam.bai rename to examples/example_data/H3K27ac/19239.chr22.keep.rmdup.bam.bai diff --git a/example_data/H3K27ac/alt_as_counts.18505.h5 b/examples/example_data/H3K27ac/alt_as_counts.18505.h5 similarity index 100% rename from example_data/H3K27ac/alt_as_counts.18505.h5 rename to examples/example_data/H3K27ac/alt_as_counts.18505.h5 diff --git a/example_data/H3K27ac/alt_as_counts.18507.h5 b/examples/example_data/H3K27ac/alt_as_counts.18507.h5 similarity index 100% rename from example_data/H3K27ac/alt_as_counts.18507.h5 rename to examples/example_data/H3K27ac/alt_as_counts.18507.h5 diff --git a/example_data/H3K27ac/alt_as_counts.18508.h5 b/examples/example_data/H3K27ac/alt_as_counts.18508.h5 similarity index 100% rename from example_data/H3K27ac/alt_as_counts.18508.h5 rename to examples/example_data/H3K27ac/alt_as_counts.18508.h5 diff --git a/example_data/H3K27ac/alt_as_counts.18516.h5 b/examples/example_data/H3K27ac/alt_as_counts.18516.h5 similarity index 100% rename from example_data/H3K27ac/alt_as_counts.18516.h5 rename to examples/example_data/H3K27ac/alt_as_counts.18516.h5 diff --git 
a/example_data/H3K27ac/alt_as_counts.18522.h5 b/examples/example_data/H3K27ac/alt_as_counts.18522.h5 similarity index 100% rename from example_data/H3K27ac/alt_as_counts.18522.h5 rename to examples/example_data/H3K27ac/alt_as_counts.18522.h5 diff --git a/example_data/H3K27ac/alt_as_counts.19141.h5 b/examples/example_data/H3K27ac/alt_as_counts.19141.h5 similarity index 100% rename from example_data/H3K27ac/alt_as_counts.19141.h5 rename to examples/example_data/H3K27ac/alt_as_counts.19141.h5 diff --git a/example_data/H3K27ac/alt_as_counts.19193.h5 b/examples/example_data/H3K27ac/alt_as_counts.19193.h5 similarity index 100% rename from example_data/H3K27ac/alt_as_counts.19193.h5 rename to examples/example_data/H3K27ac/alt_as_counts.19193.h5 diff --git a/example_data/H3K27ac/alt_as_counts.19204.h5 b/examples/example_data/H3K27ac/alt_as_counts.19204.h5 similarity index 100% rename from example_data/H3K27ac/alt_as_counts.19204.h5 rename to examples/example_data/H3K27ac/alt_as_counts.19204.h5 diff --git a/example_data/H3K27ac/alt_as_counts.19238.h5 b/examples/example_data/H3K27ac/alt_as_counts.19238.h5 similarity index 100% rename from example_data/H3K27ac/alt_as_counts.19238.h5 rename to examples/example_data/H3K27ac/alt_as_counts.19238.h5 diff --git a/example_data/H3K27ac/alt_as_counts.19239.h5 b/examples/example_data/H3K27ac/alt_as_counts.19239.h5 similarity index 100% rename from example_data/H3K27ac/alt_as_counts.19239.h5 rename to examples/example_data/H3K27ac/alt_as_counts.19239.h5 diff --git a/example_data/H3K27ac/chr22.peaks.txt.gz b/examples/example_data/H3K27ac/chr22.peaks.txt.gz similarity index 100% rename from example_data/H3K27ac/chr22.peaks.txt.gz rename to examples/example_data/H3K27ac/chr22.peaks.txt.gz diff --git a/example_data/H3K27ac/cht_as_coef.txt b/examples/example_data/H3K27ac/cht_as_coef.txt similarity index 100% rename from example_data/H3K27ac/cht_as_coef.txt rename to examples/example_data/H3K27ac/cht_as_coef.txt diff --git a/example_data/H3K27ac/cht_bnb_coef.txt b/examples/example_data/H3K27ac/cht_bnb_coef.txt similarity index 100% rename from example_data/H3K27ac/cht_bnb_coef.txt rename to examples/example_data/H3K27ac/cht_bnb_coef.txt diff --git a/example_data/H3K27ac/cht_input_file.txt b/examples/example_data/H3K27ac/cht_input_file.txt similarity index 100% rename from example_data/H3K27ac/cht_input_file.txt rename to examples/example_data/H3K27ac/cht_input_file.txt diff --git a/example_data/H3K27ac/cht_results.txt b/examples/example_data/H3K27ac/cht_results.txt similarity index 100% rename from example_data/H3K27ac/cht_results.txt rename to examples/example_data/H3K27ac/cht_results.txt diff --git a/example_data/H3K27ac/get_PCs.R b/examples/example_data/H3K27ac/get_PCs.R similarity index 100% rename from example_data/H3K27ac/get_PCs.R rename to examples/example_data/H3K27ac/get_PCs.R diff --git a/example_data/H3K27ac/haplotype_read_counts.18505.adjusted.hetp.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18505.adjusted.hetp.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18505.adjusted.hetp.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18505.adjusted.hetp.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18505.adjusted.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18505.adjusted.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18505.adjusted.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18505.adjusted.txt.gz 
diff --git a/example_data/H3K27ac/haplotype_read_counts.18505.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18505.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18505.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18505.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18507.adjusted.hetp.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18507.adjusted.hetp.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18507.adjusted.hetp.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18507.adjusted.hetp.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18507.adjusted.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18507.adjusted.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18507.adjusted.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18507.adjusted.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18507.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18507.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18507.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18507.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18508.adjusted.hetp.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18508.adjusted.hetp.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18508.adjusted.hetp.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18508.adjusted.hetp.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18508.adjusted.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18508.adjusted.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18508.adjusted.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18508.adjusted.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18508.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18508.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18508.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18508.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18516.adjusted.hetp.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18516.adjusted.hetp.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18516.adjusted.hetp.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18516.adjusted.hetp.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18516.adjusted.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18516.adjusted.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18516.adjusted.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18516.adjusted.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18516.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18516.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18516.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18516.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18522.adjusted.hetp.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18522.adjusted.hetp.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18522.adjusted.hetp.txt.gz 
rename to examples/example_data/H3K27ac/haplotype_read_counts.18522.adjusted.hetp.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18522.adjusted.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18522.adjusted.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18522.adjusted.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18522.adjusted.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.18522.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.18522.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.18522.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.18522.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19141.adjusted.hetp.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19141.adjusted.hetp.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19141.adjusted.hetp.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19141.adjusted.hetp.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19141.adjusted.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19141.adjusted.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19141.adjusted.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19141.adjusted.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19141.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19141.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19141.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19141.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19193.adjusted.hetp.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19193.adjusted.hetp.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19193.adjusted.hetp.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19193.adjusted.hetp.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19193.adjusted.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19193.adjusted.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19193.adjusted.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19193.adjusted.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19193.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19193.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19193.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19193.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19204.adjusted.hetp.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19204.adjusted.hetp.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19204.adjusted.hetp.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19204.adjusted.hetp.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19204.adjusted.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19204.adjusted.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19204.adjusted.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19204.adjusted.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19204.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19204.txt.gz 
similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19204.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19204.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19238.adjusted.hetp.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19238.adjusted.hetp.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19238.adjusted.hetp.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19238.adjusted.hetp.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19238.adjusted.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19238.adjusted.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19238.adjusted.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19238.adjusted.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19238.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19238.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19238.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19238.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19239.adjusted.hetp.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19239.adjusted.hetp.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19239.adjusted.hetp.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19239.adjusted.hetp.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19239.adjusted.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19239.adjusted.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19239.adjusted.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19239.adjusted.txt.gz diff --git a/example_data/H3K27ac/haplotype_read_counts.19239.txt.gz b/examples/example_data/H3K27ac/haplotype_read_counts.19239.txt.gz similarity index 100% rename from example_data/H3K27ac/haplotype_read_counts.19239.txt.gz rename to examples/example_data/H3K27ac/haplotype_read_counts.19239.txt.gz diff --git a/example_data/H3K27ac/other_as_counts.18505.h5 b/examples/example_data/H3K27ac/other_as_counts.18505.h5 similarity index 100% rename from example_data/H3K27ac/other_as_counts.18505.h5 rename to examples/example_data/H3K27ac/other_as_counts.18505.h5 diff --git a/example_data/H3K27ac/other_as_counts.18507.h5 b/examples/example_data/H3K27ac/other_as_counts.18507.h5 similarity index 100% rename from example_data/H3K27ac/other_as_counts.18507.h5 rename to examples/example_data/H3K27ac/other_as_counts.18507.h5 diff --git a/example_data/H3K27ac/other_as_counts.18508.h5 b/examples/example_data/H3K27ac/other_as_counts.18508.h5 similarity index 100% rename from example_data/H3K27ac/other_as_counts.18508.h5 rename to examples/example_data/H3K27ac/other_as_counts.18508.h5 diff --git a/example_data/H3K27ac/other_as_counts.18516.h5 b/examples/example_data/H3K27ac/other_as_counts.18516.h5 similarity index 100% rename from example_data/H3K27ac/other_as_counts.18516.h5 rename to examples/example_data/H3K27ac/other_as_counts.18516.h5 diff --git a/example_data/H3K27ac/other_as_counts.18522.h5 b/examples/example_data/H3K27ac/other_as_counts.18522.h5 similarity index 100% rename from example_data/H3K27ac/other_as_counts.18522.h5 rename to examples/example_data/H3K27ac/other_as_counts.18522.h5 diff --git a/example_data/H3K27ac/other_as_counts.19141.h5 
b/examples/example_data/H3K27ac/other_as_counts.19141.h5 similarity index 100% rename from example_data/H3K27ac/other_as_counts.19141.h5 rename to examples/example_data/H3K27ac/other_as_counts.19141.h5 diff --git a/example_data/H3K27ac/other_as_counts.19193.h5 b/examples/example_data/H3K27ac/other_as_counts.19193.h5 similarity index 100% rename from example_data/H3K27ac/other_as_counts.19193.h5 rename to examples/example_data/H3K27ac/other_as_counts.19193.h5 diff --git a/example_data/H3K27ac/other_as_counts.19204.h5 b/examples/example_data/H3K27ac/other_as_counts.19204.h5 similarity index 100% rename from example_data/H3K27ac/other_as_counts.19204.h5 rename to examples/example_data/H3K27ac/other_as_counts.19204.h5 diff --git a/example_data/H3K27ac/other_as_counts.19238.h5 b/examples/example_data/H3K27ac/other_as_counts.19238.h5 similarity index 100% rename from example_data/H3K27ac/other_as_counts.19238.h5 rename to examples/example_data/H3K27ac/other_as_counts.19238.h5 diff --git a/example_data/H3K27ac/other_as_counts.19239.h5 b/examples/example_data/H3K27ac/other_as_counts.19239.h5 similarity index 100% rename from example_data/H3K27ac/other_as_counts.19239.h5 rename to examples/example_data/H3K27ac/other_as_counts.19239.h5 diff --git a/example_data/H3K27ac/read_counts.18505.h5 b/examples/example_data/H3K27ac/read_counts.18505.h5 similarity index 100% rename from example_data/H3K27ac/read_counts.18505.h5 rename to examples/example_data/H3K27ac/read_counts.18505.h5 diff --git a/example_data/H3K27ac/read_counts.18507.h5 b/examples/example_data/H3K27ac/read_counts.18507.h5 similarity index 100% rename from example_data/H3K27ac/read_counts.18507.h5 rename to examples/example_data/H3K27ac/read_counts.18507.h5 diff --git a/example_data/H3K27ac/read_counts.18508.h5 b/examples/example_data/H3K27ac/read_counts.18508.h5 similarity index 100% rename from example_data/H3K27ac/read_counts.18508.h5 rename to examples/example_data/H3K27ac/read_counts.18508.h5 diff --git a/example_data/H3K27ac/read_counts.18516.h5 b/examples/example_data/H3K27ac/read_counts.18516.h5 similarity index 100% rename from example_data/H3K27ac/read_counts.18516.h5 rename to examples/example_data/H3K27ac/read_counts.18516.h5 diff --git a/example_data/H3K27ac/read_counts.18522.h5 b/examples/example_data/H3K27ac/read_counts.18522.h5 similarity index 100% rename from example_data/H3K27ac/read_counts.18522.h5 rename to examples/example_data/H3K27ac/read_counts.18522.h5 diff --git a/example_data/H3K27ac/read_counts.19141.h5 b/examples/example_data/H3K27ac/read_counts.19141.h5 similarity index 100% rename from example_data/H3K27ac/read_counts.19141.h5 rename to examples/example_data/H3K27ac/read_counts.19141.h5 diff --git a/example_data/H3K27ac/read_counts.19193.h5 b/examples/example_data/H3K27ac/read_counts.19193.h5 similarity index 100% rename from example_data/H3K27ac/read_counts.19193.h5 rename to examples/example_data/H3K27ac/read_counts.19193.h5 diff --git a/example_data/H3K27ac/read_counts.19204.h5 b/examples/example_data/H3K27ac/read_counts.19204.h5 similarity index 100% rename from example_data/H3K27ac/read_counts.19204.h5 rename to examples/example_data/H3K27ac/read_counts.19204.h5 diff --git a/example_data/H3K27ac/read_counts.19238.h5 b/examples/example_data/H3K27ac/read_counts.19238.h5 similarity index 100% rename from example_data/H3K27ac/read_counts.19238.h5 rename to examples/example_data/H3K27ac/read_counts.19238.h5 diff --git a/example_data/H3K27ac/read_counts.19239.h5 
b/examples/example_data/H3K27ac/read_counts.19239.h5 similarity index 100% rename from example_data/H3K27ac/read_counts.19239.h5 rename to examples/example_data/H3K27ac/read_counts.19239.h5 diff --git a/example_data/H3K27ac/ref_as_counts.18505.h5 b/examples/example_data/H3K27ac/ref_as_counts.18505.h5 similarity index 100% rename from example_data/H3K27ac/ref_as_counts.18505.h5 rename to examples/example_data/H3K27ac/ref_as_counts.18505.h5 diff --git a/example_data/H3K27ac/ref_as_counts.18507.h5 b/examples/example_data/H3K27ac/ref_as_counts.18507.h5 similarity index 100% rename from example_data/H3K27ac/ref_as_counts.18507.h5 rename to examples/example_data/H3K27ac/ref_as_counts.18507.h5 diff --git a/example_data/H3K27ac/ref_as_counts.18508.h5 b/examples/example_data/H3K27ac/ref_as_counts.18508.h5 similarity index 100% rename from example_data/H3K27ac/ref_as_counts.18508.h5 rename to examples/example_data/H3K27ac/ref_as_counts.18508.h5 diff --git a/example_data/H3K27ac/ref_as_counts.18516.h5 b/examples/example_data/H3K27ac/ref_as_counts.18516.h5 similarity index 100% rename from example_data/H3K27ac/ref_as_counts.18516.h5 rename to examples/example_data/H3K27ac/ref_as_counts.18516.h5 diff --git a/example_data/H3K27ac/ref_as_counts.18522.h5 b/examples/example_data/H3K27ac/ref_as_counts.18522.h5 similarity index 100% rename from example_data/H3K27ac/ref_as_counts.18522.h5 rename to examples/example_data/H3K27ac/ref_as_counts.18522.h5 diff --git a/example_data/H3K27ac/ref_as_counts.19141.h5 b/examples/example_data/H3K27ac/ref_as_counts.19141.h5 similarity index 100% rename from example_data/H3K27ac/ref_as_counts.19141.h5 rename to examples/example_data/H3K27ac/ref_as_counts.19141.h5 diff --git a/example_data/H3K27ac/ref_as_counts.19193.h5 b/examples/example_data/H3K27ac/ref_as_counts.19193.h5 similarity index 100% rename from example_data/H3K27ac/ref_as_counts.19193.h5 rename to examples/example_data/H3K27ac/ref_as_counts.19193.h5 diff --git a/example_data/H3K27ac/ref_as_counts.19204.h5 b/examples/example_data/H3K27ac/ref_as_counts.19204.h5 similarity index 100% rename from example_data/H3K27ac/ref_as_counts.19204.h5 rename to examples/example_data/H3K27ac/ref_as_counts.19204.h5 diff --git a/example_data/H3K27ac/ref_as_counts.19238.h5 b/examples/example_data/H3K27ac/ref_as_counts.19238.h5 similarity index 100% rename from example_data/H3K27ac/ref_as_counts.19238.h5 rename to examples/example_data/H3K27ac/ref_as_counts.19238.h5 diff --git a/example_data/H3K27ac/ref_as_counts.19239.h5 b/examples/example_data/H3K27ac/ref_as_counts.19239.h5 similarity index 100% rename from example_data/H3K27ac/ref_as_counts.19239.h5 rename to examples/example_data/H3K27ac/ref_as_counts.19239.h5 diff --git a/example_data/H3K27ac/samples.txt b/examples/example_data/H3K27ac/samples.txt similarity index 100% rename from example_data/H3K27ac/samples.txt rename to examples/example_data/H3K27ac/samples.txt diff --git a/example_data/README.md b/examples/example_data/README.md similarity index 100% rename from example_data/README.md rename to examples/example_data/README.md diff --git a/example_data/chromInfo.hg19.txt b/examples/example_data/chromInfo.hg19.txt similarity index 100% rename from example_data/chromInfo.hg19.txt rename to examples/example_data/chromInfo.hg19.txt diff --git a/example_data/geno_probs.h5 b/examples/example_data/geno_probs.h5 similarity index 100% rename from example_data/geno_probs.h5 rename to examples/example_data/geno_probs.h5 diff --git a/example_data/genotypes/YRI_samples.txt 
b/examples/example_data/genotypes/YRI_samples.txt similarity index 100% rename from example_data/genotypes/YRI_samples.txt rename to examples/example_data/genotypes/YRI_samples.txt diff --git a/example_data/genotypes/chr22.hg19.impute2.gz b/examples/example_data/genotypes/chr22.hg19.impute2.gz similarity index 100% rename from example_data/genotypes/chr22.hg19.impute2.gz rename to examples/example_data/genotypes/chr22.hg19.impute2.gz diff --git a/example_data/genotypes/chr22.hg19.impute2_haps.gz b/examples/example_data/genotypes/chr22.hg19.impute2_haps.gz similarity index 100% rename from example_data/genotypes/chr22.hg19.impute2_haps.gz rename to examples/example_data/genotypes/chr22.hg19.impute2_haps.gz diff --git a/example_data/haps.h5 b/examples/example_data/haps.h5 similarity index 100% rename from example_data/haps.h5 rename to examples/example_data/haps.h5 diff --git a/example_data/sim_pe_reads1.fastq.gz b/examples/example_data/sim_pe_reads1.fastq.gz similarity index 100% rename from example_data/sim_pe_reads1.fastq.gz rename to examples/example_data/sim_pe_reads1.fastq.gz diff --git a/example_data/sim_pe_reads2.fastq.gz b/examples/example_data/sim_pe_reads2.fastq.gz similarity index 100% rename from example_data/sim_pe_reads2.fastq.gz rename to examples/example_data/sim_pe_reads2.fastq.gz diff --git a/example_data/snp_index.h5 b/examples/example_data/snp_index.h5 similarity index 100% rename from example_data/snp_index.h5 rename to examples/example_data/snp_index.h5 diff --git a/example_data/snp_tab.h5 b/examples/example_data/snp_tab.h5 similarity index 100% rename from example_data/snp_tab.h5 rename to examples/example_data/snp_tab.h5 diff --git a/examples/example_data/test_chr1.snps.txt.gz b/examples/example_data/test_chr1.snps.txt.gz new file mode 100644 index 0000000..f4d056d Binary files /dev/null and b/examples/example_data/test_chr1.snps.txt.gz differ diff --git a/examples/example_data/test_chr2.snps.txt.gz b/examples/example_data/test_chr2.snps.txt.gz new file mode 100644 index 0000000..925f28b Binary files /dev/null and b/examples/example_data/test_chr2.snps.txt.gz differ diff --git a/examples/example_data/test_genome.fa b/examples/example_data/test_genome.fa new file mode 100644 index 0000000..aa5942c --- /dev/null +++ b/examples/example_data/test_genome.fa @@ -0,0 +1,6 @@ +>test_chr1 +ACTGACATAGATACGTCACGTACGACACGA +AACCATGCACAGTTGCACGGTATCACGGTT +>test_chr2 +ACATTTCATCTACAGTATGGATTTACATTT +AACCATGCACAGTTGCACGGTATCACGGTT diff --git a/examples/example_data/test_reads1.fq b/examples/example_data/test_reads1.fq new file mode 100644 index 0000000..4f7575c --- /dev/null +++ b/examples/example_data/test_reads1.fq @@ -0,0 +1,8 @@ +@test_chr1_1 +ACTGACATAGATACGTCACGTACGACACGA ++ +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhh +@test_chr2_1 +ACATTTCATCTACAGTATGGATTTACATTT ++ +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhh \ No newline at end of file diff --git a/examples/example_data/test_reads2.fq b/examples/example_data/test_reads2.fq new file mode 100644 index 0000000..fc702f7 --- /dev/null +++ b/examples/example_data/test_reads2.fq @@ -0,0 +1,8 @@ +@test_chr1_1 +AACCGTGATACCGTGCAACTGTGCATGGTT ++ +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhh +@test_chr2_1 +AACCGTGATACCGTGCAACTGTGCATGGTT ++ +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhh \ No newline at end of file diff --git a/mapping/example_mapping_workflow.sh b/examples/example_mapping_workflow.sh similarity index 54% rename from mapping/example_mapping_workflow.sh rename to examples/example_mapping_workflow.sh index 0aac0bb..143d614 100644 --- a/mapping/example_mapping_workflow.sh 
+++ b/examples/example_mapping_workflow.sh @@ -2,54 +2,57 @@ # Set these environment vars to point to # your local installation of WASP WASP=$HOME/proj/WASP -DATA_DIR=$WASP/example_data -SNP_DIR=$DATA_DIR/genotypes/snps +DATA_DIR=$WASP/examples/example_data # These environment vars point to the reference genome and bowtie2. # in the examples below, the reference genome is assumed # to be indexed for use with bowtie2 -INDEX=/data/external_public/reference_genomes/hg19/hg19 -BOWTIE=$HOME/bowtie2-2.2.5 +INDEX=$HOME/data1/external/GRC37/combined/bowtie2_index/hg37 +BOWTIE=$HOME/anaconda2/bin/bowtie2 -# First make files containing lists of SNPs. Each file should be -# named chr<#>.snps.txt.gz (e.g. chr22.snps.txt.gz) and contain 3 columns: -# position, ref allele, alt allele. -# -# Here is an example of creating SNP files from impute2 output files: -# mkdir -p $SNP_DIR -# for FILE in /data/internal/Yoruba/IMPUTE/CEU/hg19/*impute2.gz; do -# echo $FILE >&2 -# CHR=`echo $FILE | sed -n 's/^.*\(chr[0-9A-Z]*\).*.impute2.gz$/\1/p'` -# echo $CHR >&2 -# OUTPUT_FILE=$SNP_DIR/$CHR.snps.txt.gz -# gunzip -c $FILE | awk '{print $3,$4,$5}' | gzip > $OUTPUT_FILE -# done +$WASP/snp2h5/snp2h5 --chrom $DATA_DIR/chromInfo.hg19.txt \ + --format impute \ + --snp_index $DATA_DIR/genotypes/snp_index.h5 \ + --geno_prob $DATA_DIR/genotypes/geno_probs.h5 \ + --snp_tab $DATA_DIR/genotypes/snp_tab.h5 \ + --haplotype $DATA_DIR/genotypes/haps.h5 \ + --samples $DATA_DIR/genotypes/YRI_samples.txt \ + $DATA_DIR/genotypes/chr*.hg19.impute2.gz \ + $DATA_DIR/genotypes/chr*.hg19.impute2_haps.gz # Map reads using bowtie2 (or another mapping tool of your choice) -$BOWTIE/bowtie2 -x $INDEX -1 $DATA_DIR/sim_pe_reads1.fastq.gz \ - -2 $DATA_DIR/sim_pe_reads2.fastq.gz \ +$BOWTIE -x $INDEX -1 $DATA_DIR/sim_pe_reads1.fastq.gz \ + -2 $DATA_DIR/sim_pe_reads2.fastq.gz \ | samtools view -S -b -q 10 - > $DATA_DIR/sim_pe_reads.bam # Pull out reads that need to be remapped to check for bias # Use the -p option for paired-end reads. -python $WASP/mapping/find_intersecting_snps.py -p $DATA_DIR/sim_pe_reads.bam $SNP_DIR +python $WASP/mapping/find_intersecting_snps.py \ + --is_paired_end \ + --output_dir $DATA_DIR \ + --snp_index $DATA_DIR/genotypes/snp_index.h5 \ + --snp_tab $DATA_DIR/genotypes/snp_tab.h5 \ + --haplotype $DATA_DIR/genotypes/haps.h5 \ + --samples $DATA_DIR/H3K27ac/samples.txt \ + $DATA_DIR/sim_pe_reads.bam # Remap the reads, using same the program and options as before. # NOTE: If you use an option in the first mapping step that modifies the # reads (e.g. the -5 read trimming option to bowtie2) you should omit this # option during the second mapping step here (otherwise the reads will be modified # twice)! 
-$BOWTIE/bowtie2 -x $INDEX -1 $DATA_DIR/sim_pe_reads.remap.fq1.gz \ - -2 $DATA_DIR/sim_pe_reads.remap.fq2.gz \ +$BOWTIE -x $INDEX -1 $DATA_DIR/sim_pe_reads.remap.fq1.gz \ + -2 $DATA_DIR/sim_pe_reads.remap.fq2.gz \ | samtools view -S -b -q 10 - > $DATA_DIR/sim_pe_reads.remap.bam # Use filter_remapped_reads.py to create filtered list of reads that correctly # remap to same position -python $WASP/mapping/filter_remapped_reads.py -p $DATA_DIR/sim_pe_reads.to.remap.bam \ - $DATA_DIR/sim_pe_reads.remap.bam $DATA_DIR/sim_pe_reads.remap.keep.bam \ - $DATA_DIR/sim_pe_reads.to.remap.num.gz +python $WASP/mapping/filter_remapped_reads.py \ + $DATA_DIR/sim_pe_reads.to.remap.bam \ + $DATA_DIR/sim_pe_reads.remap.bam \ + $DATA_DIR/sim_pe_reads.remap.keep.bam # Create a merged BAM containing [1] reads that did # not need remapping [2] filtered remapped reads @@ -58,7 +61,7 @@ samtools merge $DATA_DIR/sim_pe_reads.keep.merged.bam \ # Sort and index the bam file samtools sort $DATA_DIR/sim_pe_reads.keep.merged.bam \ - $DATA_DIR/sim_pe_reads.keep.merged.sorted + -o $DATA_DIR/sim_pe_reads.keep.merged.sorted.bam samtools index $DATA_DIR/sim_pe_reads.keep.merged.sorted.bam diff --git a/mapping/README.md b/mapping/README.md index baa0113..2869c61 100644 --- a/mapping/README.md +++ b/mapping/README.md @@ -1,106 +1,269 @@ Pipeline for mappability filtering ================================== -This directory contains scripts that can be used to eliminate mapping bias from mapped allele-specific -reads. First, reads are mapped normally using a mapper chosen by the user (must output BAM or -SAM format). Then mapped reads that overlap single nucleotide polymorphisms (SNPs) are identified. For each read that overlaps a SNP, its genotype is swapped with that of the other allele and the read is re-mapped. Re-mapped reads that fail to map to exactly the same location in the genome are discarded. +This directory contains scripts that can be used to eliminate mapping +bias from mapped allele-specific reads. First, reads are mapped +normally using a mapper chosen by the user (must output BAM or SAM +format). Then mapped reads that overlap single nucleotide +polymorphisms (SNPs) are identified. For each read that overlaps a +SNP, its genotype is swapped with that of the other allele and the +read is re-mapped. Re-mapped reads that fail to map to exactly the +same location in the genome are discarded. -Step 1: + +SnakeMake +--------- + +We now provide a Snakemake workflow that can be used to run the entire +mappability filtering pipeline. For more information see the +[Snakemake README](README.snakemake.md) + +Step 1: +------- + +Create input files containing genetic variants (SNPs). The WASP +pipeline can take SNP input files in HDF5 format or in a text-based format. +The HDF5 format is preferred as it contains phase information that improves +the performance of the mappability filtering pipeline. (If you do not +have phasing information, then the text-based format should be used.) + +### Creating HDF5 SNP files + +Convert SNP data to HDF5 format using the program snp2h5. HDF5 files +are an efficient binary data format used by WASP scripts. The snp2h5 +program can take VCF or IMPUTE2 files as input. + +If VCF files are used, the sample names are read directly from the +header lines of the VCF file. If IMPUTE2 files are used, then +the names of the samples should be provided in a separate text file. 
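The exact layout of that samples file is not shown in this patch, so the snippet below is only a sketch: it assumes snp2h5 expects a plain-text file with one sample identifier per line, listed in the same order as the genotype columns of the IMPUTE2 files. The file name samples_names.txt matches the example command that follows, but the sample IDs are illustrative placeholders.

    # hypothetical helper: write one sample identifier per line
    # (replace these IDs with the samples actually genotyped in your IMPUTE2 files)
    printf "NA18505\nNA18507\nNA18508\n" > samples_names.txt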
+ + # using an IMPUTE input file: + ./snp2h5/snp2h5 --chrom example_data/chromInfo.hg19.txt \ + --format impute \ + --geno_prob example_data/geno_probs.h5 \ + --snp_index example_data/snp_index.h5 \ + --snp_tab example_data/snp_tab.h5 \ + --haplotype example_data/haps.h5 \ + --samples samples_names.txt + example_data/genotypes/chr*.hg19.impute2.gz \ + example_data/genotypes/chr*.hg19.impute2_haps.gz + + # using VCF files: + ./snp2h5/snp2h5 --chrom data/ucsc/hg19/chromInfo.txt.gz \ + --format vcf \ + --haplotype haplotypes.h5 \ + --snp_index snp_index.h5 \ + --snp_tab snp_tab.h5 \ + data/1000G/ALL.chr*.vcf.gz + + # read separate VCF files that contain genotype likelihoods (GL) + # or genotype probabilities (GP) if these were not in the main + # VCF file + ./snp2h5/snp2h5 --chrom data/ucsc/hg19/chromInfo.txt.gz \ + --format vcf \ + --geno_prob geno_probs.h5 \ + 1000G/supporting/genotype_likelihoods/shapeit2/ALL.chr*.gl.vcf.gz + +### Creating text-based SNP files + +The text-based input files have three space-delimited columns +(position, ref_allele, alt_allele), and one input file per chromosome. +The filenames must contain the name of the chromosome (e.g. chr2). + +We provide example scripts that can be used to create these files +from IMPUTE or VCF files: + + # get SNPs from IMPUTE files: + ./mapping/extract_impute_snps.sh example_data/genotypes/ output_snp_dir/ + + # get SNPs from VCF files: + ./mapping/extract_vcf_snps.sh example_data/genotypes/ output_snp_dir/ + + + +Step 2: ------- -Map the fastq files using your favorite mapper/options and filter for quality using a cutoff of your choice +Map the fastq files using your favorite mapper/options and filter for +quality using a cutoff of your choice ### Example: - tophat --no-coverage-search -o ${LANE_NAME}_out Sequences/hg18_norand ${LANE_NAME}.fastq.gz - samtools view -b -q 10 ${LANE_NAME}_out/accepted_hits.bam > ${LANE_NAME}_out/accepted_hits.quality.bam + bowtie2 -x bowtie2_index/hg37 -1 ${SAMPLE_NAME}_1.fq.gz \ + -2 ${SAMPLE_NAME}_2.fq.gz \ + | samtools view -b -q 10 - > map1/${SAMPLE_NAME}.bam + samtools sort -o map1/${SAMPLE_NAME}.sort.bam map1/${SAMPLE_NAME}.bam + samtools index map1/${SAMPLE_NAME}.sort.bam + -Step 2: +Step 3: ------- Use find_intersecting_snps.py to identify reads that may have mapping biases -#### Usage: - find_intersecting_snps.py [-p] - -p indicates that reads are paired-end (default is single) - -m changes the maximum window to search for SNPs. The default is - 100,000 base pairs. Reads or read pairs that span more than this distance - (usually due to splice junctions) will be thrown out. Increasing this window - allows for longer junctions, but may increase run time and memory requirements. - is the bamfile from the initial mapping process - is the directory containing the SNPs segregating within the - sample in question (which need to be checked for mappability issues). This directory - should contain sorted files of SNPs separated by chromosome and named: - chr<#>.snps.txt.gz - These files should contain 3 columns: position RefAllele AltAllele +### Example: + python mapping/find_intersecting_snps.py \ + --is_paired_end \ + --is_sorted \ + --output_dir find_intersecting_snps \ + --snp_tab snp_tab.h5 \ + --snp_index snp_index.h5 \ + --haplotype haplotype.h5 \ + --samples my_samples.txt \ + map1/${SAMPLE_NAME}.sort.bam + +#### Usage: + positional arguments: + bam_filename Coordinate-sorted input BAM file containing + mapped reads. 
+ + optional arguments: + -h, --help show this help message and exit + --is_paired_end, -p Indicates that reads are paired-end (default + is single). + --is_sorted, -s Indicates that the input BAM file is + coordinate-sorted (default is False). + --max_seqs MAX_SEQS The maximum number of sequences with + different allelic combinations to consider + remapping (default=64). Read pairs with + more allelic combinations than MAX_SEQS are + discarded. + --max_snps MAX_SNPS The maximum number of SNPs allowed to + overlap a read before discarding the read. + Allowing higher numbers will decrease speed + and increase memory usage (default=6). + --output_dir OUT_DIR Directory to write output files to. If not + specified, output files are written to the + same directory as the input BAM file. + --snp_dir SNP_DIR Directory containing SNP text files. This + directory should contain one file per + chromosome, named like chr<#>.snps.txt. + Each file should contain 3 columns: position + RefAllele AltAllele. This option should + only be used if the --snp_tab, --snp_index, + and --haplotype arguments are not used. + If this argument is provided, all possible + allelic combinations are used (rather + than the set of observed haplotypes). + --snp_tab SNP_TABLE_H5_FILE + Path to HDF5 file to read SNP info from. Each + row of the SNP table contains SNP name, position, + allele1, allele2. + --snp_index SNP_INDEX_H5_FILE + Path to HDF5 file containing SNP index. + The SNP index is used to convert the + genomic position of a SNP to + its corresponding row in the haplotype and + snp_tab HDF5 files. + --haplotype HAPLOTYPE_H5_FILE + Path to HDF5 file to read phased haplotypes + from. When generating alternative reads, use + known haplotypes from this file rather than + all possible allelic combinations. + --samples SAMPLES Use only haplotypes and SNPs that are + polymorphic in these samples. SAMPLES can + either be a comma-delimited string of sample + names or a path to a file with one + sample name per line (the file is assumed to be + whitespace-delimited and the first column is + assumed to be the sample name). Sample names + should match those present in the + --haplotype file. Samples are ignored if no + haplotype file is provided. #### Output: - input.sort.bam - Sorted bamfile of the original input - input.keep.bam - bamfile with reads that did not intersect SNPs or indels and therefore can - be kept without remapping - input.to.remap.bam - bamfile with original reads that overlapped SNPs that need to be remapped - input.to.remap.num.gz - the number of variants of the original read that must be remapped - input.remap.fq.gz - fastq file containing the reads with the new variants to remap. If the - paired-end option is used two files ending with .fq1.gz and .fq2.gz will be output. - - Note: Reads that overlap indels are currently excluded and will not be present in any of the 'remap' files - or the input.keep.bam file. For this reason the total number of reads will not add up to the - number of reads provided in the input.sort.bam file. + PREFIX.keep.bam - bamfile with reads that did not intersect SNPs + or indels and can therefore be kept without remapping + PREFIX.to.remap.bam - bamfile with original reads that overlapped SNPs + and therefore need to be remapped + PREFIX.remap.fq.gz - fastq file containing the reads with flipped + alleles to remap. If the paired-end option is used, + two files ending with .fq1.gz and .fq2.gz are output.
+ (PREFIX is the name of the input file, excluding the trailing .bam) + + Note: Reads that overlap indels are currently excluded and + will not be present in any of the 'remap' files or the + input.keep.bam file. For this reason the total number of reads + will not add up to the number of reads provided in the + input.sort.bam file. -#### Example: - python find_intersecting_snps.py ${LANE_NAME}_out/accepted_hits.quality.bam SNP_files/ -Step 3 +Step 4 ----- -Map the input.remap.fq.gz using the same mapping arguments used in Step 1. Note that -the arguments should be exactly the same as those in Step 1 EXCEPT for arguments that -directly modify the reads that are used by the aligner. For example the read trimming -arguments to bowtie (-3 and -5 arguments) should be used in Step 1 ONLY because -they modify the reads that are output by bowtie. +Map the PREFIX.remap.fq.gz using the same mapping arguments used in +Step 1. Note that the arguments should be exactly the same as those in +Step 1 EXCEPT for arguments that directly modify the reads that are +used by the aligner. For example, the read trimming arguments to bowtie +(-3 and -5 arguments) should be used in Step 1 ONLY, because they +modify the reads that are output by bowtie. -#### Example: - tophat --no-coverage-search -o ${LANE_NAME}_out_remap hg18_norand ${LANE_NAME}_out/accepted_hits.quality.remap.fq.gz - samtools view -b -q 10 ${LANE_NAME}_out_remap/accepted_hits.bam > ${LANE_NAME}_out_remap/accepted_hits.quality.bam +### Example: + bowtie2 -x bowtie2_index/hg37 \ + -1 find_intersecting_snps/${SAMPLE_NAME}.remap.fq1.gz \ + -2 find_intersecting_snps/${SAMPLE_NAME}.remap.fq2.gz \ + | samtools view -b -q 10 - > map2/${SAMPLE_NAME}.bam + samtools sort -o map2/${SAMPLE_NAME}.sort.bam map2/${SAMPLE_NAME}.bam + samtools index map2/${SAMPLE_NAME}.sort.bam -Step 4 +Step 5 ------ -Use filter_remapped_reads.py to retrieve reads that remapped correctly +Use filter_remapped_reads.py to filter out reads where one or more +of the allelic versions of a read fail to map back to the same +location as the original read. #### Usage: - filter_remapped_reads.py [-p] - -p option indicates that the reads are paired-end - output from find_intersecting_snps.py which contains - the original aligned reads that were remapped - output from the second mapping step (Step 3) - file where reads that are kept after remapping are stored - is the file from find_intersecting_snps.py which contains - the number of remapped sequences + filter_remapped_reads.py [-h] to_remap_bam remap_bam keep_bam + + positional arguments: + to_remap_bam input BAM file containing the original set of reads that + needed to be remapped after having their alleles + flipped. This file is output by the + find_intersecting_snps.py script. + remap_bam input BAM file containing remapped reads (with flipped + alleles) + keep_bam output BAM file to write the filtered set of reads to #### Example: - filter_remapped_reads.py ${LANE_NAME}_out/accepted_hits.quality.to.remap.bam ${LANE_NAME}_out_remap/accepted_hits.quality.bam ${LANE_NAME}.remap.keep.bam ${LANE_NAME}_out/accepted_hits.quality.to.remap.num.gz + python mapping/filter_remapped_reads.py \ + find_intersecting_snps/${SAMPLE_NAME}.to.remap.bam \ + map2/${SAMPLE_NAME}.sort.bam \ + filter_remapped_reads/${SAMPLE_NAME}.keep.bam -At the end of the pipeline, ${LANE_NAME}.keep.bam and ${LANE_NAME}.remap.keep.bam -can be merged for a complete set of mappability filtered aligned reads.
The merged -file should then be sorted and indexed: - samtools merge ${LANE_NAME}.keep.merged.bam ${LANE_NAME}.keep.bam ${LANE_NAME}.remap.keep.bam - samtools sort ${LANE_NAME}.keep.merged.bam ${LANE_NAME}.keep.merged.sorted - samtools index ${LANE_NAME}.keep.merged.sorted.bam +Step 6 +------ + +Merge ${SAMPLE_NAME}.keep.bam and ${SAMPLE_NAME}.remap.keep.bam +to obtain the complete set of mappability-filtered aligned reads. +The merged file should then be sorted and indexed: + +#### Example: + samtools merge merge/${SAMPLE_NAME}.keep.merge.bam \ + filter_remapped_reads/${SAMPLE_NAME}.keep.bam \ + find_intersecting_snps/${SAMPLE_NAME}.keep.bam + samtools sort -o merge/${SAMPLE_NAME}.keep.merge.sort.bam \ + merge/${SAMPLE_NAME}.keep.merge.bam + samtools index merge/${SAMPLE_NAME}.keep.merge.sort.bam -Step 5 (Optional) +Step 7 ------ -Filter duplicate reads. Programs such as samtools rmdup introduce bias when they filter duplicate reads because they -retain the read with the highest score (which usually matches the reference). We provide a script rmdup.py which performs unbiased removal of duplicate reads. The script discards duplicate reads at random (independent of their score). The input BAM or SAM file must be sorted. +Filter duplicate reads. Programs such as samtools rmdup introduce bias +when they filter duplicate reads because they retain the read with the +highest score (which usually matches the reference). We provide a +script, rmdup.py, which performs unbiased removal of duplicate +reads. The script discards duplicate reads at random (independent of +their score). The input BAM or SAM file must be sorted. #### Usage: - # for single end reads: - python rmdup.py - # for paired-end reads: - python rmdup_pe.py + # for single end reads: + python rmdup.py + # for paired-end reads: + python rmdup_pe.py ## Testing -To run the tests, execute `py.test` from within this directory. +To run the tests, execute `py.test` from within the mapping directory. +The tests currently require bowtie2 and samtools to be in the PATH. diff --git a/mapping/README.snakemake.md b/mapping/README.snakemake.md new file mode 100644 index 0000000..b34f3c7 --- /dev/null +++ b/mapping/README.snakemake.md @@ -0,0 +1,118 @@ +## Snakemake Mappability Filtering Pipeline + +[Snakemake](https://bitbucket.org/snakemake/snakemake/wiki/Home) is a +workflow management system, designed to streamline the execution of +software pipelines. We now provide a Snakemake rule file that can be +used to run the entire Mappability Filtering Pipeline. + +For a more complete description of Snakemake see the +[Snakemake tutorial](http://snakemake.bitbucket.org/snakemake-tutorial.html). + +## Installing Snakemake + +Snakemake requires python3; however, the mapping pipeline scripts require +python2. For this reason, if you are using +[Anaconda](https://www.continuum.io/downloads), it is recommended that +you create a [python3 +environment](http://conda.pydata.org/docs/py2or3.html#create-a-python-3-5-environment). For example, you can create a python3.5 Anaconda environment with the following shell command (this only needs to be done once): + + conda create -n py35 python=3.5 anaconda + +You can then activate the py35 environment, and install the latest version of +Snakemake with the following commands: + + source activate py35 + conda install snakemake + +Then when you want to switch back to your default (e.g.
python2) environment +do the following: + + source deactivate + + + +## Configuring the Mappability Filtering Pipeline + +The rules for the Snakemake tasks are defined in the [Snakefile](Snakefile). + +Configuration parameters for this Snakefile are read from the YAML file +[snake_conf.yaml](snake_conf.yaml). + +Before running Snakemake, edit this file to specify the location +of all of the input directories and files that will be used by the pipeline. +This includes locations of the impute2 or VCF SNP files, input BAM files, etc. + +Importantly, you must set `wasp_dir` to point to the location of WASP +on your system, and set `py2` to set up the environment +for python (e.g. by modifying your PATH) and call the +appropriate interpreter. This is necessary because Snakemake is run +using python3, but most of the scripts require python2. + + +## Running the Mappability Filtering Pipeline + +Snakemake can be run as a single process or on a compute cluster with +multiple jobs running simultaneously. To run Snakemake on a single node +you could do something like the following: + + source activate py35 + cd $WASP_DIR/mapping + snakemake + +We provide a script [run_snakemake.sh](run_snakemake.sh) to run Snakemake +on an SGE compute cluster. You must be in a python3 environment to run this +script, and the script must be run from a job submission host. + + source activate py35 + cd $WASP_DIR/mapping + ./run_snakemake.sh + +It should be possible to make simple modifications to this script to +run on queue management systems other than SGE (e.g. LSF or Slurm). + + +You should run Snakemake from within a [Screen](https://www.gnu.org/software/screen/) virtual terminal or using [nohup](https://en.wikipedia.org/wiki/Nohup) so +that if you are disconnected from the cluster, Snakemake will continue to run. + +## Output from the Mappability Filtering Pipeline + +The Snakemake pipeline creates the following directories under the +output_dir specified in snake_conf.yaml. The rmdup and as_counts directories +contain the final output. + +* map1 - results from first mapping of reads to genome +* map1_sort - sorted BAM files from first mapping +* find_intersecting_snps - results from find_intersecting_snps.py +* map2 - results from second mapping of reads to genome +* map2_sort - sorted BAM files from second mapping +* filter_remapped_reads - results from filter_remapped_reads.py +* merge - merged reads that should be kept because they did not + overlap a SNP or because they overlapped a SNP and all + alleles remapped to the same location. +* rmdup - Final BAM files with duplicate reads removed. Only *sort* files + need to be kept. +* as_counts - allele-specific read counts at polymorphic SNPs + +Once the pipeline is complete, most of the files can be removed to save space: + + # remove unsorted files from rmdup dir: + ls rmdup/ | grep -v sort | xargs rm + # remove intermediate files and directories: + rm -rf map1 map1_sort find_intersecting_snps map2 map2_sort filter_remapped_reads merge + + +## Debugging the Snakemake jobs + +By default Snakemake will write an output and error file for each job +to your home directory. These files will be named like `snakejob...sh.{e|o}`. For example: + + # contains error output for the find_intersecting_snps_paired_end rule: + snakejob.find_intersecting_snps_paired_end.13.sh.e4507125 + +If a rule fails, you should check the appropriate output file to see what +error occurred. A major benefit of Snakemake is that if you re-run snakemake +after a job fails it will pick up where it left off.
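Before re-starting the pipeline, a dry run is a quick way to see which jobs Snakemake still considers incomplete. The commands below use standard Snakemake options rather than anything WASP-specific (`-n` performs a dry run and `-p` prints the shell command for each pending job); the directory is assumed to be the one containing this Snakefile.

    # sketch: preview the remaining jobs without executing them
    source activate py35
    cd $WASP_DIR/mapping
    snakemake -n -p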
+ +You should make sure that all of the existing jobs have been killed before +re-starting the pipeline. + diff --git a/mapping/Snakefile b/mapping/Snakefile new file mode 100644 index 0000000..6ec5d45 --- /dev/null +++ b/mapping/Snakefile @@ -0,0 +1,247 @@ + +configfile: "snake_conf.yaml" + +import glob + + +def read_samples(): + """Function to get names and fastq paths from a sample file specified + in the configuration. Input file is expected to have 4 columns: + <1000genomes_id> . Modify this function + as needed to provide a dictionary of sample_id keys and (fastq1, fastq1) + values""" + f = open(config['sample_file'], "r") + samp_dict = {} + for line in f: + words = line.strip().split() + samp_dict[words[1]] = (words[2], words[3]) + + return samp_dict + +def read_1kg_samples(): + f = open(config['sample_file'], "r") + samp_dict = {} + for line in f: + words = line.strip().split() + samp_dict[words[1]] = words[0] + + return samp_dict + +SAMP_TO_1KG = read_1kg_samples() + + + +def get_chromosomes(): + """Gets list of chromosomes with from VCF files""" + filenames = os.listdir(config['vcf_dir']) + + chr_names = set([]) + for filename in filenames: + if filename.endswith(".vcf.gz"): + m = re.match(".*(chr[0-9A-Z]+).*", filename) + if m: + chr_names.add(m.groups()[0]) + + return chr_names + + +rule all: + input: + expand(config['output_dir'] + "/as_counts/{sample}.as_counts.txt.gz", + sample=read_samples().keys()) + + + +## Here are rules to use text input files for SNPs instead of +## HDF5 files (that include haplotype information) +# +# rule extract_vcf_snps: +# """Makes SNP input files for find_intersecting_snps.py by extracting +# information from VCF files.""" +# input: +# config["vcf_dir"] +# output: +# expand(config["snp_dir"] + "/{chromosome}.snps.txt.gz", chromosome=get_chromosomes()) +# shell: +# "mkdir -p {config[snp_dir]} ; " +# "{config[wasp_dir]}/mapping/extract_vcf_snps.sh {input} {config[snp_dir]}" +# +# rule find_intersecting_snps_paired_end: +# """find intersecting SNPs using WASP script""" +# input: +# bam=config["output_dir"] + "/map1_sort/{sample}.bam", +# snps=[config["snp_dir"] + "/%s.snps.txt.gz" % s for s in get_chromosomes()] +# output: +# fastq1=config["output_dir"] + "/find_intersecting_snps/{sample}.remap.fq1.gz", +# fastq2=config["output_dir"] + "/find_intersecting_snps/{sample}.remap.fq2.gz", +# keep_bam=config["output_dir"] + "/find_intersecting_snps/{sample}.keep.bam", +# remap_bam=config["output_dir"] + "/find_intersecting_snps/{sample}.to.remap.bam" +# shell: +# "mkdir -p {config[output_dir]}/find_intersecting_snps ; " +# "{config[py2]} {config[wasp_dir]}/mapping/find_intersecting_snps.py " +# " --is_paired_end --is_sorted --output_dir {config[output_dir]}/find_intersecting_snps {input.bam} {config[snp_dir]}" + + +rule vcf2h5: + """Convert VCF data files to HDF5 format""" + input: + chrom=config['chrom_info'], + vcfs=glob.glob(config['vcf_dir'] + "/*chr*.vcf.gz") + output: + snp_index=config['snp_h5_dir'] + "/snp_index.h5", + snp_tab=config['snp_h5_dir'] + "/snp_tab.h5", + haplotype=config['snp_h5_dir'] + "/haplotype.h5" + shell: + "mkdir -p {config[snp_h5_dir]}; " + "{config[wasp_dir]}/snp2h5/snp2h5 " + " --chrom {input.chrom} " + " --format vcf " + " --snp_index {output.snp_index} " + " --snp_tab {output.snp_tab} " + " --haplotype {output.haplotype} " + " {input.vcfs}" + + +rule find_intersecting_snps_paired_end: + """find intersecting SNPs using WASP script""" + input: + bam=config["output_dir"] + "/map1_sort/{sample}.bam", + 
snp_index=config["snp_h5_dir"] + "/snp_index.h5", + snp_tab=config["snp_h5_dir"] + "/snp_tab.h5", + haplotype=config['snp_h5_dir'] + "/haplotype.h5" + output: + fastq1=config["output_dir"] + "/find_intersecting_snps/{sample}.remap.fq1.gz", + fastq2=config["output_dir"] + "/find_intersecting_snps/{sample}.remap.fq2.gz", + keep_bam=config["output_dir"] + "/find_intersecting_snps/{sample}.keep.bam", + remap_bam=config["output_dir"] + "/find_intersecting_snps/{sample}.to.remap.bam" + shell: + "mkdir -p {config[output_dir]}/find_intersecting_snps ; " + "{config[py2]} {config[wasp_dir]}/mapping/find_intersecting_snps.py " + " --is_paired_end " + " --is_sorted " + " --output_dir {config[output_dir]}/find_intersecting_snps " + " --snp_tab {input.snp_tab} " + " --snp_index {input.snp_index} " + " --haplotype {input.haplotype} " + " --samples {config[sample_file]} " + " {input.bam}" + + + +rule map_bowtie2_paired_end1: + """map reads using bowtie2""" + input: + fastq1=lambda wildcards: read_samples()[wildcards.sample][0], + fastq2=lambda wildcards: read_samples()[wildcards.sample][1] + output: + config["output_dir"] + "/map1/{sample}.bam" + shell: + "mkdir -p " + config["output_dir"] + "/map1 ; " + "{config[bowtie2]} -x {config[bowtie2_index]} -1 {input.fastq1} -2 {input.fastq2} " + "| {config[samtools]} view -b -q 10 - > {output} " + + +rule sort_and_index_bam1: + """sort and index bam generated by first mapping step""" + input: + config["output_dir"] + "/map1/{sample}.bam" + output: + config["output_dir"] + "/map1_sort/{sample}.bam", + config["output_dir"] + "/map1_sort/{sample}.bam.bai" + shell: + "mkdir -p {config[output_dir]}/map1_sort ; " + "{config[samtools]} sort -o {output[0]} {input}; " + "{config[samtools]} index {output[0]}" + + + +rule map_bowtie2_paired_end2: + """map reads a second time using bowtie2""" + input: + fastq1=config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq1.gz", + fastq2=config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq2.gz" + output: + config["output_dir"] + "/map2/{sample}.bam" + shell: + "mkdir -p " + config["output_dir"] + "/map2 ; " + "{config[bowtie2]} -x {config[bowtie2_index]} -1 {input.fastq1} -2 {input.fastq2} " + "| {config[samtools]} view -b -q 10 - > {output}" + + +rule sort_and_index_bam2: + """sort and index bam generated by second mapping step""" + input: + config["output_dir"] + "/map2/{sample}.bam" + output: + config["output_dir"] + "/map2_sort/{sample}.bam", + config["output_dir"] + "/map2_sort/{sample}.bam.bai" + shell: + "mkdir -p {config[output_dir]}/map2_sort ; " + "{config[samtools]} sort -o {output[0]} {input} ; " + "{config[samtools]} index {output[0]}" + + +rule filter_remapped_reads: + """filter reads from second mapping step""" + input: + to_remap_bam=config['output_dir'] + "/find_intersecting_snps/{sample}.to.remap.bam", + remap_bam=config['output_dir'] + "/map2_sort/{sample}.bam", + output: + keep_bam=config['output_dir'] + "/filter_remapped_reads/{sample}.keep.bam" + shell: + "mkdir -p {config[output_dir]}/filter_remapped_reads ; " + "{config[py2]} {config[wasp_dir]}/mapping/filter_remapped_reads.py " + " {input.to_remap_bam} {input.remap_bam} {output.keep_bam}" + + +rule merge_bams: + """merge 'keep' BAM files from mapping steps 1 and 2, then sort and index""" + input: + keep1=config['output_dir'] + "/find_intersecting_snps/{sample}.keep.bam", + keep2=config['output_dir'] + "/filter_remapped_reads/{sample}.keep.bam" + output: + merge=config['output_dir'] + "/merge/{sample}.keep.merge.bam", + 
sort=config['output_dir'] + "/merge/{sample}.keep.merge.sort.bam" + shell: + "mkdir -p {config[output_dir]}/merge ; " + "{config[samtools]} merge {output.merge} {input.keep1} {input.keep2}; " + "{config[samtools]} sort -o {output.sort} {output.merge}; " + "{config[samtools]} index {output.sort}" + + +rule rmdup_pe: + """remove duplicate read pairs""" + input: + config['output_dir'] + "/merge/{sample}.keep.merge.sort.bam" + output: + rmdup=config['output_dir'] + "/rmdup/{sample}.keep.merge.rmdup.bam", + sort=config['output_dir'] + "/rmdup/{sample}.keep.merge.rmdup.sort.bam" + shell: + "mkdir -p {config[output_dir]}/rmdup ; " + "{config[py2]} {config[wasp_dir]}/mapping/rmdup_pe.py {input} {output.rmdup} ;" + "{config[samtools]} sort -o {output.sort} {output.rmdup}; " + "{config[samtools]} index {output.sort}" + + + +rule get_as_counts: + """get allele-specific read counts for SNPs""" + input: + bam=config['output_dir'] + "/rmdup/{sample}.keep.merge.rmdup.sort.bam", + snp_index=config["snp_h5_dir"] + "/snp_index.h5", + snp_tab=config["snp_h5_dir"] + "/snp_tab.h5", + haplotype=config['snp_h5_dir'] + "/haplotype.h5", + params: + samp1kg=lambda wildcards: SAMP_TO_1KG[wildcards.sample] + output: + config['output_dir'] + "/as_counts/{sample}.as_counts.txt.gz" + shell: + "mkdir -p {config[output_dir]}/as_counts ; " + "{config[py2]} {config[wasp_dir]}/mapping/get_as_counts.py " + " --snp_tab {input.snp_tab} " + " --snp_index {input.snp_index} " + " --haplotype {input.haplotype} " + " --samples {config[sample_file]} " + " --genotype_sample {params.samp1kg} " + " {input.bam} | gzip > {output}" + diff --git a/mapping/extract_impute_snps.sh b/mapping/extract_impute_snps.sh new file mode 100755 index 0000000..6cb9317 --- /dev/null +++ b/mapping/extract_impute_snps.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# +# This script takes IMPUTE-formatted files and +# creates files that can be used by the find_intersecting_snps.py script. +# The script takes an INPUT directory and an OUTPUT directory. +# +# The INPUT directory is expected to contain files ending with .impute2.gz +# and to contain the name of the chromosome (like chr22 or chr1). +# +# OUTPUT files are named $CHR.snps.txt.gz (where $CHR is the name of the +# chromosome). OUTPUT files contain on +# each line. +# + +INPUT_DIR=$1 +OUTPUT_DIR=$2 + +if [ ! $INPUT_DIR ]; then + echo "usage: extract_inpute_snps.sh " >&2 + exit 2 +fi + +if [ ! $OUTPUT_DIR ]; then + echo "usage: extract_inpute_snps.sh " >&2 + exit 2 +fi + +mkdir -p $OUTPUT_DIR + +for FILE in $INPUT_DIR/*impute2.gz; do + echo $FILE >&2 + CHR=`echo $FILE | sed -n 's/^.*\(chr[0-9A-Z]*\).*.impute2.gz$/\1/p'` + echo $CHR >&2 + OUTPUT_FILE=$OUTPUT_DIR/$CHR.snps.txt.gz + gunzip -c $FILE | awk '{print $3,$4,$5}' | gzip > $OUTPUT_FILE +done + diff --git a/mapping/extract_vcf_snps.sh b/mapping/extract_vcf_snps.sh new file mode 100755 index 0000000..e190a82 --- /dev/null +++ b/mapping/extract_vcf_snps.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# +# This script takes VCF files and generates files that can be used +# by the find_intersecting_snps.py script. +# The script takes an INPUT directory and an OUTPUT directory. +# +# The INPUT directory is expected to contain files ending with .vcf.gz +# and to contain the name of the chromosome (like chr22 or chr1). +# +# OUTPUT files are named $CHR.snps.txt.gz (where $CHR is the name of the +# chromosome). OUTPUT files contain on +# each line. +# + +INPUT_DIR=$1 +OUTPUT_DIR=$2 + +if [ ! $INPUT_DIR ]; then + echo "usage: extract_vcf_snps.sh " >&2 + exit 2 +fi + +if [ ! 
$OUTPUT_DIR ]; then + echo "usage: extract_vcf_snps.sh " >&2 + exit 2 +fi + +mkdir -p $OUTPUT_DIR + +for FILE in $INPUT_DIR/*vcf.gz; do + echo $FILE >&2 + CHR=`echo $FILE | sed -n 's/^.*\(chr[0-9A-Z]*\).*.vcf.gz$/\1/p'` + echo $CHR >&2 + OUTPUT_FILE=$OUTPUT_DIR/$CHR.snps.txt.gz + gunzip -c $FILE | egrep -v "^#" | awk '{print $2,$4,$5}' | gzip > $OUTPUT_FILE +done + diff --git a/mapping/filter_remapped_reads.py b/mapping/filter_remapped_reads.py index f533f20..788aed2 100644 --- a/mapping/filter_remapped_reads.py +++ b/mapping/filter_remapped_reads.py @@ -1,181 +1,181 @@ -def run(to_remap_bam, remap_bam, keep_bam, orig_num_file, is_paired_end): - import gzip - import sys - - import pysam - - to_remap_bam = pysam.Samfile(to_remap_bam, "rb") - remap_bam = pysam.Samfile(remap_bam, "rb") - keep_bam = pysam.Samfile(keep_bam, "wb", template=to_remap_bam) - orig_num_file = gzip.open(orig_num_file) - - # correct_maps is a list of reads that mapped correctly. The read is - # is represented by its "ID number" (which is the order that it was seen in - # the original input bam file and also the first number in its new read - # name, e.g. the 42 in @42:chr1:14536:1). For paired end, both reads will - # have the same ID number. This means that the first read pair will have ID - # number 1, the second will be 2, and so on. For a read or read pair to be - # remapped correctly, the ID number X of that read/read pair should show up - # N amount of times in correct_maps where N is the value in orig_num_file at - # line X (one-based). - correct_maps = [] - end_of_file = False + +import argparse +import sys + +import pysam + + + +def parse_options(): + parser = argparse.ArgumentParser(description="This program checks " + "whether reads that overlap SNPs map " + "back to the same location as the " + "original reads after their alleles " + "are flipped by the " + "find_intersecting_snps.py script. " + "Reads where one or more allelic versions " + "map to a different location (or fail " + "to map) are discarded. Reads that are " + "kept are written to the specified " + "keep_bam output file. Reads in the " + "input remap_bam file are expected to " + "have read names encoding the original " + "map location and number of allelic " + "variants. Specifically, the read names " + "should be delimited with the '.' " + "character and " + "contain the following fields: " + ".." + ".. " + "These read names are " + "generated by the " + "find_intersecting_snps.py script.") - # Get a list of reads that remapped correctly. - remap_read = remap_bam.next() + parser.add_argument("to_remap_bam", help="input BAM file containing " + "original set of reads that needed to " + "be remapped after having their alleles flipped." + " This file is output by the find_intersecting_snps.py " + "script.") + parser.add_argument("remap_bam", help="input BAM file containing " + "remapped reads (with flipped alleles)") + parser.add_argument("keep_bam", help="output BAM file to write " + "filtered set of reads to") + + return parser.parse_args() + + + + +def filter_reads(remap_bam): + # dictionary to keep track of how many times a given read is observed + read_counts = {} + + # names of reads that should be kept + keep_reads = set([]) + bad_reads = set([]) - while not end_of_file: - """ - The read names in the fastq files for remapping consist of four or - five parts (depending on single (1:chr1:763045:1) or paired end data - (1:chr1:763006:763045:3)) separated by colons. The first part is the - remap number. 
The first read or read pair from the input file to be - remapped is 1, the second read pair is 2, etc. The second part is the - chromosome name. The third part is the alignment position of the - original read pair. If paired end data, the third part is the - alignment position of the left read (relative to the reference) and - the fourth part is the position of the right read. This is - irrespective of which read is R1 and R2. The left reads are written - into the R1 fastq file and the right reads are written into the R2 - fastq file. The last part is the number of alternative sequences for - the read pair (i.e. the number of new sequences with swapped alleles - we have to map to see if they match with the alignment of the original - sequence). Note that if a read pair overlaps multiple SNPs and has - multiple alternate sequences, each of those alternate sequences will - have the exact same read name. - """ - - chrm = remap_read.qname.strip().split(":")[1] + for read in remap_bam: + if read.is_secondary: + # only keep primary alignments and discard 'secondary' alignments + continue - if remap_read.is_reverse: - if is_paired_end: - pos = int(remap_read.qname.strip().split(":")[3]) + # parse name of read, which should contain: + # 1 - the original name of the read + # 2 - the coordinate that it should map to + # 3 - the number of the read + # 4 - the total number of reads being remapped + words = read.qname.split(".") + if len(words) < 4: + raise ValueError("expected read names to be formatted " + "like .." + ". but got " + "%s" % read.qname) + + # token separator '.' can potentially occur in + # original read name, so if more than 4 tokens, + # assume first tokens make up original read name + orig_name = ".".join(words[0:len(words)-3]) + coord_str, num_str, total_str = words[len(words)-3:] + num = int(num_str) + total = int(total_str) + + correct_map = False + + if '-' in coord_str: + # paired end read, coordinate gives expected positions for each end + c1, c2 = coord_str.split("-") + + if not read.is_paired: + bad_reads.add(orig_name) + continue + if not read.is_proper_pair: + bad_reads.add(orig_name) + continue + + pos1 = int(c1) + pos2 = int(c2) + if pos1 < pos2: + left_pos = pos1 + right_pos = pos2 else: - pos = int(remap_read.qname.strip().split(":")[2]) + left_pos = pos2 + right_pos = pos1 + + # only use left end of reads, but check that right end is in + # correct location + if read.pos < read.next_reference_start: + if pos1 == read.pos+1 and pos2 == read.next_reference_start+1: + # both reads mapped to correct location + correct_map = True + else: + # this is right end of read + continue else: - pos = int(remap_read.qname.strip().split(":")[2]) - read_num = int(remap_read.qname.strip().split(":")[0]) - if (remap_read.tid != -1 and remap_read.pos == pos and - remap_bam.getrname(remap_read.tid) == chrm): - dels = 0 - # Throw out the remapped read if it remapped with a deletion...for - # now. - for cig in remap_read.cigar: - if not cig[0] in (0, 3, 4): - dels += 1 - if dels == 0: - correct_maps.append(read_num) - try: - remap_read = remap_bam.next() - except: - end_of_file = True - - correct_maps.sort() - - # Pull out original aligned reads if all of the alternatives mapped - # correctly. - orig_read = to_remap_bam.next() - # orig_num is the number of different reads generated from the original read - # (pair) (depends on number of alleles it overlapped). For paired end data, - # this is number is at least two (one for each read) and will always be a - # multiple of two. 
- orig_num = int(orig_num_file.readline().strip()) - # Line number of the remap_bam file (if single end data) or read pair - # number if paired end data. - line_num = 1 - - # Index for walking through correct_maps. - map_indx = 0 - # Number of correctly mapped reads for the current read (pair). - correct = 0 - # Total number of correctly mapped read (pairs). - total_correct = 0 - end_of_file = False - - # The idea here is that we will walk through remap_bam and check to see - # whether all of the possible reads spawned from the original read (pair) - # were mapped consistently with the original read (pair). Since correct_maps - # is sorted, it should look something like [1, 1, 2, 2, 3, 3, 3, 3, ...]. - # If this example is for paired end data, this would mean that two reads - # from the first pair mapped correctly, two reads from the second read pair - # mapped correctly, four reads from the third read pair mapped correctly, - # and so on. If the number of reads that mapped correctly for a given ID - # number is equal to the corresponding orig_num, then everything is - # consistent and we keep the read pair. For instance, in the example above, - # correct_maps starts out with [1, 1]. This means that we have two correctly - # mapped reads for the read pair with ID number 1 (the first read pair). If - # the number on the first line of orig_num_file is 2, then we keep the read - # pair because both reads mapped correctly. If the first number of - # orig_num_file was 4, then we wouldn't keep the first read pair because - # two of the reads didn't map correctly. - while (not end_of_file and - (map_indx < len(correct_maps)) and - (line_num <= correct_maps[-1])): - if line_num != correct_maps[map_indx]: - # If we saw the correct number of remaps for the last read, we can - # keep it. - if correct == orig_num: - total_correct += 1 - keep_bam.write(orig_read) - # If the data is paired end, write out the paired read. - if is_paired_end: - try: - orig_read = to_remap_bam.next() - except: - sys.stderr.write("File ended unexpectedly (no pair found).") - exit() - keep_bam.write(orig_read) + # single end read + pos = int(coord_str) + + if pos == read.pos+1: + # read maps to correct location + correct_map = True + + if correct_map: + if orig_name in read_counts: + read_counts[orig_name] += 1 else: - try: - second_read = to_remap_bam.next() - except: - end_of_file=True - break - - try: - orig_read = to_remap_bam.next() - orig_num = int(orig_num_file.readline().strip()) - except StopIteration: - end_of_file = True - line_num += 1 - correct = 0 + read_counts[orig_name] = 1 + + if read_counts[orig_name] == total: + # all alternative versions of this read + # mapped to correct location + if orig_name in keep_reads: + raise ValueError("saw read %s more times than " + "expected in input file" % orig_name) + keep_reads.add(orig_name) + + # remove read from counts to save mem + del read_counts[orig_name] + else: + # read maps to different location + bad_reads.add(orig_name) + + return keep_reads, bad_reads + + + + +def write_reads(to_remap_bam, keep_bam, keep_reads, bad_reads): + + keep_count = 0 + bad_count = 0 + discard_count = 0 + + for read in to_remap_bam: + if read.qname in bad_reads: + bad_count += 1 + elif read.qname in keep_reads: + keep_count += 1 + keep_bam.write(read) else: - correct += 1 - map_indx += 1 - - if correct == orig_num: - total_correct += 1 - keep_bam.write(orig_read) - # If the data is paired end, write out the paired read. 
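
To make the keep/discard bookkeeping above concrete, here is a small self-contained sketch (toy data only, not WASP code) of the rule that filter_reads and write_reads implement together: a read is kept only if every one of its allele-flipped versions remaps to the original coordinate, and a single version that maps elsewhere is enough to discard it.

    # toy records: (read name, total versions, whether each version remapped
    # back to the original coordinate)
    remapped = [
        ("read1", 2, [True, True]),          # all versions map back -> keep
        ("read2", 3, [True, False, True]),   # one version strays   -> discard
    ]

    read_counts = {}
    keep_reads, bad_reads = set(), set()

    for name, total, outcomes in remapped:
        for ok in outcomes:
            if not ok:
                bad_reads.add(name)              # one bad mapping taints the read
                break
            read_counts[name] = read_counts.get(name, 0) + 1
            if read_counts[name] == total:
                keep_reads.add(name)             # every version mapped correctly

    print(sorted(keep_reads))    # ['read1']
    print(sorted(bad_reads))     # ['read2']

Note that in write_reads above, membership in bad_reads is checked before keep_reads, so a read flagged as bad is discarded even if it also accumulated the expected number of correct mappings.
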
- if is_paired_end: - try: - orig_read = to_remap_bam.next() - except: - sys.stderr.write("File ended unexpectedly (no pair found).") - exit() - keep_bam.write(orig_read) - sys.stderr.write('%d read (pair)s remapped to the correct position\n' % - total_correct) - -def main(): - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("-p", action='store_true', dest='is_paired_end', - default=False, help=('Indicates that reads are ' - 'paired-end (default is single).')) - h = ('to.remap.bam file from find_intersecting_snps.py.') - parser.add_argument("to_remap_bam", help=h) - parser.add_argument("remap_bam", help='Remapped bam file.') - parser.add_argument("keep_bam", help=('File to write correctly remapped ' - 'reads to.')) - h = 'to.remap.num.gz file from find_intersecting_snps.py.' - parser.add_argument("orig_num_file", help=h) + discard_count += 1 + + sys.stderr.write("keep_reads: %d\n" % keep_count) + sys.stderr.write("bad_reads: %d\n" % bad_count) + sys.stderr.write("discard_reads: %d\n" % discard_count) - options = parser.parse_args() + - run(options.to_remap_bam, options.remap_bam, options.keep_bam, - options.orig_num_file, options.is_paired_end) +def main(to_remap_bam_path, remap_bam_path, keep_bam_path): + to_remap_bam = pysam.Samfile(to_remap_bam_path) + remap_bam = pysam.Samfile(remap_bam_path) + keep_bam = pysam.Samfile(keep_bam_path, "wb", template=to_remap_bam) + + keep_reads, bad_reads = filter_reads(remap_bam) + + write_reads(to_remap_bam, keep_bam, keep_reads, bad_reads) + + + +if __name__ == "__main__": + options = parse_options() + main(options.to_remap_bam, options.remap_bam, options.keep_bam) -if __name__ == '__main__': - main() diff --git a/mapping/find_intersecting_snps.py b/mapping/find_intersecting_snps.py index 1f0273a..dbef29a 100644 --- a/mapping/find_intersecting_snps.py +++ b/mapping/find_intersecting_snps.py @@ -1,811 +1,932 @@ -import argparse -import gzip import sys +import os +import gzip +import argparse +import numpy as np -import array import pysam +import util +import snptable + +import tables -MAX_WINDOW_DEFAULT = 100000 +MAX_SEQS_DEFAULT = 64 +MAX_SNPS_DEFAULT = 6 -class SNP: - """SNP objects hold data for a single SNP""" - def __init__(self, snp_line): - """ - Initialize SNP object. +class DataFiles(object): + """Object to hold names and filehandles for all input / output + datafiles""" + + def __init__(self, bam_filename, is_sorted, is_paired, + output_dir=None, snp_dir=None, + snp_tab_filename=None, snp_index_filename=None, + haplotype_filename=None, samples=None): + # flag indicating whether reads are paired-end + self.is_paired = is_paired - Parameters: - ----------- + # prefix for output files + self.prefix = None + + # name of input BAM filename + self.bam_filename = bam_filename + # name of sorted input bam_filename + # (new file is created if input file is not + # already sorted) + self.bam_sort_filename = None + # pysam file handle for input BAM + self.input_bam = None + + # name of output keep and to.remap BAM files + self.keep_filename = None + self.remap_filename = None + + # pysam file handles for output BAM filenames + self.keep_bam = None + self.remap_bam = None - snp_line : str - Line from SNP file. 
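
The per-chromosome SNP files referred to here (named chr<#>.snps.txt.gz and produced, for example, by the extract_vcf_snps.sh script earlier in this patch) contain three whitespace-separated columns: 1-based position, reference allele, and alternate allele. A minimal sketch of reading such lines; the chromosome and SNP values are made up:

    import sys

    # chr20.snps.txt.gz might contain lines such as the following
    example_lines = [
        "61098 C T",
        "61270 A G",
        "62240 T -",   # '-' marks a missing allele, i.e. an indel
    ]
    for line in example_lines:
        pos, allele1, allele2 = line.split()
        # the files store 1-based positions; pysam's read.pos is 0-based
        sys.stdout.write("%d %s %s\n" % (int(pos) - 1, allele1, allele2))
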
+ + # name of output fastq files + self.fastq_single_filename = None + self.fastq1_filename = None + self.fastq2_filename = None + self.fastq1 = None + self.fastq2 = None + self.fastq_single = None + + # name of directory to read SNPs from + self.snp_dir = snp_dir - Attributes: - ----------- + # paths to HDF5 files to read SNP info from + self.snp_tab_filename = snp_tab_filename + self.snp_index_filename = snp_index_filename + self.haplotype_filename = haplotype_filename - pos : int - Genomic position of SNP. + if self.snp_tab_filename: + self.snp_tab_h5 = tables.openFile(snp_tab_filename, "r") + self.snp_index_h5 = tables.openFile(snp_index_filename, "r") + self.hap_h5 = tables.openFile(haplotype_filename, "r") + else: + self.snp_tab_h5 = None + self.snp_index_h5 = None + self.hap_h5 = None - alleles : list - List of two alleles. + + # separate input directory and bam filename + tokens = self.bam_filename.split("/") + bam_dir = "/".join(tokens[:-1]) + filename = tokens[-1] + + if output_dir is None: + # if no output dir specified, use same directory as input + # bam file + output_dir = bam_dir + else: + if output_dir.endswith("/"): + # strip trailing '/' from output dir name + output_dir = output_dir[:-1] + + name_split = filename.split(".") + if len(name_split) > 1: + self.prefix = output_dir + "/" + ".".join(name_split[:-1]) + else: + self.prefix = output_dir + "/" + name_split[0] + + # TODO: could allow names of output files to be specified + # on command line rather than appending name to prefix + sys.stderr.write("prefix: %s\n" % self.prefix) + + if not is_sorted: + util.sort_bam(self.bam_filename, self.prefix) + self.bam_sort_filename = self.prefix + ".sort.bam" + else: + self.bam_sort_filename = self.bam_filename - ptype : str - Type of polymorphism (snp or indel). If there are multiple alleles - and one is an indel, ptype will be "indel". If the alleles are all - single nucleotide variants, ptype will be "snp". + self.keep_filename = self.prefix + ".keep.bam" + self.remap_filename = self.prefix + ".to.remap.bam" + + sys.stderr.write("reading reads from:\n %s\n" % + self.bam_sort_filename) + + sys.stderr.write("writing output files to:\n") + + if self.is_paired: + self.fastq1_filename = self.prefix + ".remap.fq1.gz" + self.fastq2_filename = self.prefix + ".remap.fq2.gz" + self.fastq1 = gzip.open(self.fastq1_filename, "wb") + self.fastq2 = gzip.open(self.fastq2_filename, "wb") + self.fastq_single_filename = self.prefix + ".remap.single.fq.gz" + self.fastq_single = gzip.open(self.fastq_single_filename, "wb") + sys.stderr.write(" %s\n %s\n %s\n" % + (self.fastq1_filename, + self.fastq2_filename, + self.fastq_single_filename)) + + else: + self.fastq_single_filename = self.prefix + ".remap.fq.gz" + self.fastq_single = gzip.open(self.fastq_single_filename, "wb") + sys.stderr.write(" %s\n" % (self.fastq_single_filename)) + + self.input_bam = pysam.Samfile(self.bam_sort_filename, "rb") + self.keep_bam = pysam.Samfile(self.keep_filename, "wb", + template=self.input_bam) + self.remap_bam = pysam.Samfile(self.remap_filename, "wb", + template=self.input_bam) + sys.stderr.write(" %s\n %s\n" % (self.keep_filename, + self.remap_filename)) - max_len : int - Maximum allele length. If greater than one, implies an insertion. 
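
For orientation, the naming scheme implemented by the DataFiles constructor above produces the following outputs for a hypothetical paired-end input /data/sample1.bam with no --output_dir given (files land next to the input, and the prefix is the path minus the .bam extension); single-end runs instead write a single <prefix>.remap.fq.gz:

    import sys

    prefix = "/data/sample1"   # hypothetical input /data/sample1.bam

    expected_outputs = [
        prefix + ".sort.bam",            # only written when --is_sorted is not set
        prefix + ".keep.bam",            # reads kept without remapping
        prefix + ".to.remap.bam",        # reads whose allele-flipped versions must be remapped
        prefix + ".remap.fq1.gz",        # first mates to remap
        prefix + ".remap.fq2.gz",        # second mates to remap
        prefix + ".remap.single.fq.gz",  # ends processed as single reads (e.g. mate unmapped)
    ]
    sys.stdout.write("\n".join(expected_outputs) + "\n")
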
- """ - snp_split = snp_line.strip().split() - self.pos = int(snp_split[0]) - 1 - self.alleles = [snp_split[1], snp_split[2]] - self.ptype = "snp" - self.max_len = 0 + - for i in range(len(self.alleles)): - if self.alleles[i] == "-": - self.alleles[i] = "" - self.ptype = "indel" - elif len(self.alleles[i]) > self.max_len: - self.max_len = len(self.alleles[i]) - - if self.max_len > 1: - self.ptype = "indel" + def close(self): + """close open filehandles""" + filehandles = [self.keep_bam, self.remap_bam, self.fastq1, + self.fastq2, self.fastq_single, + self.snp_tab_h5, self.snp_index_h5, + self.hap_h5] + + for fh in filehandles: + if fh: + fh.close() + + +class ReadStats(object): + """Track information about reads and SNPs that they overlap""" + + def __init__(self): + # number of read matches to reference allele + self.ref_count = 0 + # number of read matches to alternative allele + self.alt_count = 0 + # number of reads that overlap SNP but match neither allele + self.other_count = 0 + + # number of reads discarded because not proper pair + self.discard_improper_pair = 0 + + # paired reads map to different chromosomes + self.discard_different_chromosome = 0 + + # number of reads discarded because overlap an indel + self.discard_indel = 0 + + # number of reads discarded because secondary match + self.discard_secondary = 0 + + # number of reads discarded because of too many overlapping SNPs + self.discard_excess_snps = 0 - def add_allele(self, new_alleles): - """ - Add new alleles for a snp or indel. + # number of reads discarded because too many allelic combinations + self.discard_excess_reads = 0 + + # reads where we expected to see other pair, but it was missing + # possibly due to read-pairs with different names + self.discard_missing_pair = 0 + + # number of single reads kept + self.keep_single = 0 + # number of read pairs kept + self.keep_pair = 0 + + # number of single reads that need remapping + self.remap_single = 0 + # number of read pairs kept + self.remap_pair = 0 - Parameters: - ----------- - - new_alleles : list - List of alleles (each allele is a string like "A"). - - """ - # If a string is passed, each element of the string will be added as an - # allele. - assert type(new_alleles) is list - for new_allele in new_alleles: - if new_allele == "-": - self.ptype = "indel" - new_allele = "" - # Only add the new allele if it doesn't already exist. - if not (new_allele in self.alleles): - self.alleles.append(new_allele) - if len(new_allele) > self.max_len: - self.max_len = len(new_allele) - - if self.max_len > 1: - self.ptype = "indel" - - def shift_indel(self): - """ - Currently not used anywhere. - """ - self.pos += 1 - self.max_len -= 1 - i = 0 - while i < len(self.alleles): - if len(self.alleles) <= 1: - self.alleles.pop(i) - else: - self.alleles[i] = self.alleles[i][1:] - i += 1 - self.alleles.append("") -class BamScanner: - """ - Class to keep track of all the information read in from the bamfile/snpfile. - """ + def write(self, file_handle): + sys.stderr.write("DISCARD reads:\n" + " improper pair: %d\n" + " different chromosome: %d\n" + " indel: %d\n" + " secondary alignment: %d\n" + " excess overlapping snps: %d\n" + " excess allelic combinations: %d\n" + " missing pairs (e.g. 
mismatched read names): %d\n" + "KEEP reads:\n" + " single-end: %d\n" + " pairs: %d\n" + "REMAP reads:\n" + " single-end: %d\n" + " pairs: %d\n" % + (self.discard_improper_pair, + self.discard_different_chromosome, + self.discard_indel, + self.discard_secondary, + self.discard_excess_snps, + self.discard_excess_reads, + self.discard_missing_pair, + self.keep_single, + self.keep_pair, + self.remap_single, + self.remap_pair)) + + file_handle.write("read SNP ref matches: %d\n" % self.ref_count) + file_handle.write("read SNP alt matches: %d\n" % self.alt_count) + file_handle.write("read SNP mismatches: %d\n" % self.other_count) + + total = self.ref_count + self.alt_count + self.other_count + if total > 0: + mismatch_pct = 100.0 * float(self.other_count) / total + if mismatch_pct > 10.0: + sys.stderr.write("WARNING: many read SNP overlaps do not match " + "either allele (%.1f%%). SNP coordinates " + "in input file may be incorrect.\n" % + mismatch_pct) + + - def __init__(self, is_paired_end, max_window, file_name, keep_file_name, - remap_name, remap_num_name, fastq_names, snp_dir): - """ - Constructor: opens files, creates initial table. + +def parse_options(): + + parser = argparse.ArgumentParser(description="Looks for SNPs and indels " + "overlapping reads. If a read overlaps " + "SNPs, alternative versions of the read " + "containing different alleles are created " + "and written to files for remapping. " + "Reads that do not overlap SNPs or indels " + "are written to a 'keep' BAM file." + "Reads that overlap indels are presently " + "discarded.") + + + parser.add_argument("--is_paired_end", "-p", action='store_true', + dest='is_paired_end', + default=False, + help=("Indicates that reads are paired-end " + "(default is single).")) + + parser.add_argument("--is_sorted", "-s", action='store_true', + dest='is_sorted', + default=False, + help=('Indicates that the input BAM file' + ' is coordinate-sorted (default ' + 'is False).')) + + parser.add_argument("--max_seqs", type=int, default=MAX_SEQS_DEFAULT, + help="The maximum number of sequences with different " + "allelic combinations to consider remapping " + "(default=%d). Read pairs with more allelic " + "combinations than MAX_SEQs are discarded" % + MAX_SEQS_DEFAULT) + + parser.add_argument("--max_snps", type=int, default=MAX_SNPS_DEFAULT, + help="The maximum number of SNPs allowed to overlap " + "a read before discarding the read. Allowing higher " + "numbers will decrease speed and increase memory " + "usage (default=%d)." + % MAX_SNPS_DEFAULT) + + parser.add_argument("--output_dir", default=None, + help="Directory to write output files to. If not " + "specified, output files are written to the " + "same directory as the input BAM file.") + + parser.add_argument("--snp_dir", action='store', + help="Directory containing SNP text files " + "This directory should contain one file per " + "chromosome named like chr<#>.snps.txt.gz. " + "Each file should contain 3 columns: position " + "RefAllele AltAllele. This option should " + "only be used if --snp_tab, --snp_index, " + "and --haplotype arguments are not used." + " If this argument is provided, all possible " + "allelic combinations are used (rather " + "than set of observed haplotypes).", + default=None) - Attributes: - ----------- - is_paired_end : boolean - Boolean indicating whether input data are paired end. + parser.add_argument("--snp_tab", + help="Path to HDF5 file to read SNP information " + "from. 
Each row of SNP table contains SNP name " + "(rs_id), position, allele1, allele2.", + metavar="SNP_TABLE_H5_FILE", + default=None) + + parser.add_argument("--snp_index", + help="Path to HDF5 file containing SNP index. The " + "SNP index is used to convert the genomic position " + "of a SNP to its corresponding row in the haplotype " + "and snp_tab HDF5 files.", + metavar="SNP_INDEX_H5_FILE", + default=None) + + parser.add_argument("--haplotype", + help="Path to HDF5 file to read phased haplotypes " + "from. When generating alternative reads " + "use known haplotypes from this file rather " + "than all possible allelic combinations.", + metavar="HAPLOTYPE_H5_FILE", + default=None) + + parser.add_argument("--samples", + help="Use only haplotypes and SNPs that are " + "polymorphic in these samples. " + "SAMPLES can either be a comma-delimited string " + "of sample names or a path to a file with one sample " + "name per line (file is assumed to be whitespace-" + "delimited and first column is assumed to be sample " + "name). Sample names should match those present in the " + "--haplotype file. Samples are ignored if no haplotype " + "file is provided.", + metavar="SAMPLES") + + parser.add_argument("bam_filename", action='store', + help="Coordinate-sorted input BAM file " + "containing mapped reads.") + + + options = parser.parse_args() - snp_dir : str - Path to directory that contains gzipped SNP files (one per - chromosome). + if options.snp_dir: + if(options.snp_tab or options.snp_index or options.haplotype): + parser.error("expected --snp_dir OR (--snp_tab, --snp_index and " + "--haplotype) arguments but not both") + else: + if not (options.snp_tab and options.snp_index and options.haplotype): + parser.error("either --snp_dir OR (--snp_tab, " + "--snp_index AND --haplotype) arguments must be " + "provided") + + if options.samples and not options.haplotype: + # warn because no way to use samples if haplotype file not specified + sys.stderr.write("WARNING: ignoring --samples argument " + "because --haplotype argument not provided") - bamfile : pysam.Samfile - Input bam file that we are reading. + return options - keep_bam : pysam.Samfile - Output bam file of reads that do not overlap SNPs. - remap_bam : pysam.Samfile - Output bam file of reads that do overlap SNPs and need to be - remapped. - remap_num_file : gzip.GzipFile - File to write XXX to. - fastqs : list - List of gzip.GzipFile objects for the different fastqs that will - contain the reads to remap. - read_table : list - List of lists. Sublist i contains the reads whose positions are - [real read position] % max_window. - cur_read : pysam.XXX - Current read from the bam file. - end_of_file : boolean - Boolean indicating whether we've reached the end of the bam file. - remap_num : int - A counter for the number of reads to be remapped. This starts at one - and is incremented when a read (pair) is written to the fastq - file(s). TODO: Is this supposed to start at one? - ref_match : int - This is incremented everytime a read sequence matches a SNP - genotype. Note that a particular read sequence can be looked at - multiple times if it has multiple SNPs, so this is somewhat hard to - interpret. +def write_read(read, snp_tab, snp_idx, read_pos): + snp_allele1 = [' '] * read.qlen + snp_allele2 = [' '] * read.qlen - alt_match : int - This is initialized but not used anywhere. 
+ for (s_idx, r_idx) in zip(snp_idx, read_pos): + a1 = snp_tab.snp_allele1[s_idx] + a2 = snp_tab.snp_allele2[s_idx] - no_match : int - This is incremented everytime a read sequence doesn't matche a SNP - genotype. Note that a particular read sequence can be looked at - multiple times if it has multiple SNPs, so this is somewhat hard to - interpret. + snp_allele1[r_pos-1] = a1 + snp_allele2[r_pos-1] = a2 - toss : int - Number of reads tossed. + sys.stderr.write("READ: %s\n" % read.query) + sys.stderr.write("A1: %s\n" % "".join(snp_allele1)) + sys.stderr.write("A2: %s\n" % "".join(snp_allele2)) + + + - nosnp : int - Number of reads with no SNPs. If one read in a read pair has a SNP - and the other doesn't, both "nosnp" and "remap" (below) will be - incremented by one. - remap : int - Number of reads to remap. If one read in a read pair has a SNP and - the other doesn't, both "nosnp" and "remap" (below) will be - incremented by one. +def count_ref_alt_matches(read, read_stats, snp_tab, snp_idx, read_pos): + ref_alleles = snp_tab.snp_allele1[snp_idx] + alt_alleles = snp_tab.snp_allele2[snp_idx] + + for i in range(len(snp_idx)): + if ref_alleles[i] == read.query[read_pos[i]-1]: + # read matches reference allele + read_stats.ref_count += 1 + elif alt_alleles[i] == read.query[read_pos[i]-1]: + # read matches non-reference allele + read_stats.alt_count += 1 + else: + # read matches neither ref nor other + read_stats.other_count += 1 + - tot : int - I think this is total number of reads, although it is only - incremented in empty_slot_single(self) so it doesn't seem to be - fully implemented right now. - printstats : boolean - Boolean for some print statements, currently not used. +def get_unique_haplotypes(haplotypes, snp_idx): + """returns list of vectors of unique haplotypes for this set of SNPs""" + haps = haplotypes[snp_idx,:].T + + # create view of data that joins all elements of column + # into single void datatype + h = np.ascontiguousarray(haps).view(np.dtype((np.void, haps.dtype.itemsize * haps.shape[1]))) - num_reads : int - Number of reads for a given window that we have read but not yet - written. This number is incremented when we read in a read and - decremented when we pop a read out of the read table. + # get index of unique columns + _, idx = np.unique(h, return_index=True) - window_too_small : int - The number of reads thrown out because their CIGAR contained a run - of N's longer than max_window. + return haps[idx,:] + - cur_snp : SNP - The current SNP to be or being parsed. - pos : int - The current genomic position we are analyzing. - chr_num : int - Bam file ID number of the current chromosome we are analyzing. + +def generate_haplo_reads(read_seq, snp_idx, read_pos, ref_alleles, alt_alleles, + haplo_tab): + haps = get_unique_haplotypes(haplo_tab, snp_idx) - chr_name : str - Name of the current chromosome we are analyzing. 
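
The get_unique_haplotypes helper above uses a common NumPy idiom for de-duplicating rows of a 2-D array: each row is reinterpreted as a single opaque void value so that np.unique can compare whole rows at once. A minimal sketch with a made-up haplotype matrix (rows are haplotypes, columns are the SNPs overlapping one read; 0 = reference allele, 1 = alternate allele):

    import numpy as np

    haps = np.array([[0, 1],
                     [0, 1],
                     [1, 1],
                     [0, 0]], dtype=np.int8)

    # view each row as one void value spanning the whole row
    row_view = np.ascontiguousarray(haps).view(
        np.dtype((np.void, haps.dtype.itemsize * haps.shape[1])))

    # indices of the first occurrence of each distinct row
    _, idx = np.unique(row_view, return_index=True)

    print(haps[idx, :])   # three distinct haplotypes: [0 0], [0 1], [1 1]

On NumPy 1.13 and later, np.unique(haps, axis=0) achieves the same result more directly.
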
+ # sys.stderr.write("UNIQUE haplotypes: %s\n" + # "read_pos: %s\n" + # % (repr(haps), read_pos)) + + read_len = len(read_seq) + + new_read_list = [] + + # loop over haplotypes + for hap in haps: + new_read = [] + cur_pos = 1 + + missing_data = False + + # loop over the SNPs to get alleles that make up this haplotype + for i in range(len(hap)): + if read_pos[i] > cur_pos: + # add segment of read + new_read.append(read_seq[cur_pos-1:read_pos[i]-1]) + # add segment for appropriate allele + if hap[i] == 0: + # reference allele + new_read.append(ref_alleles[i]) + elif hap[i] == 1: + # alternate allele + new_read.append(alt_alleles[i]) + else: + # haplotype has unknown genotype or phasing so skip it... + # not sure if this is the best thing to do, could instead + # assume ambiguity of this allele and generate reads with + # both possible alleles + missing_data = True + break + + cur_pos = read_pos[i] + 1 - max_window : int - Size of the window in base pairs to process reads. All of the reads - and SNPs within max_window base pairs are processed at once. Any - junction-spanning reads (i.e. with N in the cigar) that extend - outside of the window are thrown out. + if read_len >= cur_pos: + # add final segment + new_read.append(read_seq[cur_pos-1:read_len]) + + if not missing_data: + new_seq = "".join(new_read) + + # sanity check: read should be same length as original + if len(new_seq) != read_len: + raise ValueError("Expected read len to be %d but " + "got %d.\n" + "ref_alleles: %s\n" + "alt_alleles: %s\n" + "read_pos: %s\n" + "snp_idx: %s\n" + "haps: %s\n" + % (read_len, len(new_seq), + repr(ref_alleles), repr(alt_alleles), + repr(read_pos), repr(snp_idx), + repr(haps))) + + new_read_list.append("".join(new_seq)) + + return new_read_list - """ - - self.is_paired_end = is_paired_end - - # Read in all input files and create output files - self.snp_dir = snp_dir - self.bamfile = pysam.Samfile(file_name,"rb") - self.keep_bam = pysam.Samfile(keep_file_name, "wb", - template=self.bamfile) - self.remap_bam = pysam.Samfile(remap_name, "wb", template=self.bamfile) - self.remap_num_file = gzip.open(remap_num_name, "w") - self.fastqs = [gzip.open(fqn,"w") for fqn in fastq_names] - try: - self.cur_read = self.bamfile.next() - except: - sys.stderr.write("No lines available for input") - return() - self.end_of_file = False - - self.remap_num = 1 - self.ref_match = 0 - self.alt_match = 0 - self.no_match = 0 - self.toss = 0 - self.nosnp = 0 - self.remap = 0 - self.tot = 0 - self.window_too_small = 0 - - self.printstats = True - - self.num_reads = 0 + - self.cur_snp = None - - self.pos = self.cur_read.pos - self.chr_num = self.cur_read.tid - self.chr_name = self.bamfile.getrname(self.cur_read.tid) - self.max_window = max_window + +def generate_reads(read_seq, read_pos, ref_alleles, alt_alleles, i): + """Recursively generate set of reads with all possible combinations + of alleles (i.e. 
2^n combinations where n is the number of snps overlapping + the reads) + """ + # TODO: this would use a lot less memory if re-implemented + # to not use recursion + + # create new version of this read with both reference and + # alternative versions of allele at this index + idx = read_pos[i]-1 + ref_read = read_seq[:idx] + ref_alleles[i] + read_seq[idx+1:] + alt_read = read_seq[:idx] + alt_alleles[i] + read_seq[idx+1:] + + if i == len(read_pos)-1: + # this was the last SNP + return [ref_read, alt_read] + + # continue recursively with other SNPs overlapping this read + reads1 = generate_reads(ref_read, read_pos, ref_alleles, alt_alleles, i+1) + reads2 = generate_reads(alt_read, read_pos, ref_alleles, alt_alleles, i+1) + + return reads1 + reads2 - # Initialize the read tracking tables. - self.read_table = [[] for x in range(self.max_window)] + + +def write_fastq(fastq_file, orig_read, new_seqs): + n_seq = len(new_seqs) + i = 1 + for new_seq in new_seqs: + # Give each read a new name giving: + # 1 - the original name of the read + # 2 - the coordinate that it should map to + # 3 - the number of the read + # 4 - the total number of reads being remapped + name = "%s.%d.%d.%d" % (orig_read.qname, orig_read.pos+1, i, n_seq) + + fastq_file.write("@%s\n%s\n+%s\n%s\n" % + (name, new_seq, name, orig_read.qual)) + + i += 1 + - # Initialize the SNP and indel tracking tables. - self.switch_chr() +def write_pair_fastq(fastq_file1, fastq_file2, orig_read1, orig_read2, + new_pairs): + + n_pair = len(new_pairs) + i = 1 + for pair in new_pairs: + # give each fastq record a new name giving: + # 1 - the original name of the read + # 2 - the coordinates the two ends of the pair should map to + # 3 - the number of the read + # 4 - the total number of reads being remapped + + pos_str = "%d-%d" % (min(orig_read1.pos+1, orig_read2.pos+1), + max(orig_read1.pos+1, orig_read2.pos+1)) - # Fill all tables. - self.fill_table() - - def fill_table(self): - """ - Fills the table of reads starting from the current position and - extending for the next base pairs. The read table is a - list of lists of length max_window. If the position of the current read - is 100, the first sublist contains all of the reads at position 100, the - next sublist contains all of the reads at position 101, etc. - """ - if self.end_of_file: - return() - - # For first read we need to set self.pos and initialize the SNP table. - if self.num_reads == 0: - self.pos = self.cur_read.pos - self.init_snp_table() - #self.num_reads+=1000 - - while (self.cur_read.tid == self.chr_num) and \ - (self.cur_read.pos < (self.pos + self.max_window)): - self.num_reads += 1 - self.read_table[self.cur_read.pos % self.max_window].append(self.cur_read) - - # Get a new read and check for the end of the file. - try: - self.cur_read = self.bamfile.next() - except: - self.empty_table() - self.end_of_file = True - return() - - # Check to see if we've come across a new chromosome. 
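
The recursive generate_reads above enumerates all 2^n allelic versions of a read; the in-code TODO notes that an iterative version would use less memory. One possible non-recursive sketch (not the WASP implementation) based on itertools.product, assuming as the original does that each variant is a single base and that read_pos holds 1-based positions within the read:

    from itertools import product

    def generate_reads_iterative(read_seq, read_pos, ref_alleles, alt_alleles):
        """Return every combination of ref/alt alleles substituted at the
        read positions that overlap SNPs (2^n sequences for n SNPs)."""
        new_reads = []
        for alleles in product(*zip(ref_alleles, alt_alleles)):
            seq = list(read_seq)
            for pos, allele in zip(read_pos, alleles):
                seq[pos - 1] = allele          # read_pos is 1-based
            new_reads.append("".join(seq))
        return new_reads

    # hypothetical 5 bp read with SNPs at read positions 2 and 5
    print(generate_reads_iterative("ACGTA", [2, 5], ["C", "A"], ["T", "G"]))
    # ['ACGTA', 'ACGTG', 'ATGTA', 'ATGTG']

As with the recursive version, one of the generated sequences can equal the original read; downstream code removes that duplicate before the fastq files for remapping are written.
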
- if self.cur_read.tid != self.chr_num: - self.empty_table() - self.chr_num = self.cur_read.tid - try: - self.chr_name = self.bamfile.getrname(self.chr_num) - except: - sys.stderr.write("Problem with tid: " + str(self.chr_num) + "\n") - self.skip_chr() - self.pos = self.cur_read.pos - self.switch_chr() - self.fill_table() - - - def switch_chr(self): - """Switches to looking for SNPs on next chromosome.""" - chr_match = False - while not chr_match and not self.end_of_file: - try: - self.snpfile = gzip.open("%s/%s.snps.txt.gz" - % (self.snp_dir,self.chr_name)) - sys.stderr.write("Starting on chromosome " + self.chr_name+"\n") - chr_match = True - except: - sys.stderr.write("SNP file for chromosome " + - self.chr_name + " is not found. Skipping these reads.\n") - self.skip_chr() + name = "%s.%s.%d.%d" % (orig_read1.qname, pos_str, i, n_pair) - self.end_of_snp_file = False - self.get_next_snp() - - def init_snp_table(self): - """ - Creates an empty SNP table starting from the position of the current - and extending max_window base pairs. The SNP table is max_window long - and has a zero if there are no variants overlapping a position or - contains a SNP object if there is variant that overlaps a given - position. + fastq_file1.write("@%s\n%s\n+%s\n%s\n" % + (name, pair[0], name, orig_read1.qual)) + + rev_seq = util.revcomp(pair[1]) + fastq_file2.write("@%s\n%s\n+%s\n%s\n" % + (name, rev_seq, name, orig_read2.qual)) + + i += 1 + + + - Also creates an indel table which is a list of lists of length - max_window. - - Also creates an indel dict which is a dict whose keys are genomic - positions and whose values are SNP objects whose ptype is indel. - """ - # Number of SNPs in this table. I think this is total number of - # different alleles across the whole table. I'm not exactly sure. - self.num_snps = 0 - self.indel_dict = {} - self.snp_table = [0 for x in range(self.max_window)] - self.indel_table = [[] for x in range(self.max_window)] - # Get SNPs in this window but skip SNPs that are upstream of the current - # read. - while not self.end_of_snp_file and self.cur_snp.pos < self.pos: - self.get_next_snp() - - # Add SNPs downstream of the current read and within the current window. - while not self.end_of_snp_file and (self.cur_snp.pos < self.pos + self.max_window): - if self.cur_snp.ptype == "snp": - self.add_snp() - else: - self.add_indel() - self.get_next_snp() + +def filter_reads(files, max_seqs=MAX_SEQS_DEFAULT, max_snps=MAX_SNPS_DEFAULT, + samples=None): + cur_chrom = None + cur_tid = None + seen_chrom = set([]) + + snp_tab = snptable.SNPTable() + read_stats = ReadStats() + read_pair_cache = {} + cache_size = 0 + read_count = 0 + + for read in files.input_bam: + read_count += 1 + # if (read_count % 100000) == 0: + # sys.stderr.write("\nread_count: %d\n" % read_count) + # sys.stderr.write("cache_size: %d\n" % cache_size) - def add_snp(self): - """ - Add a SNP to the SNP table. If the SNP table has a zero at this - position, the SNP object will replace the zero. If the SNP table - already has a SNP object at this position, the SNP will be added as new - alleles. 
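
The read_pair_cache dictionary set up at the top of the new filter_reads above is used further down to pair mates from a coordinate-sorted BAM: the first mate seen is parked under its read name, and when the second mate arrives the pair is processed together. A toy sketch of that idea (made-up records, not WASP code):

    # (read name, mate number) records in the order they appear in the BAM
    reads = [("pairA", 1), ("pairB", 1), ("pairA", 2), ("pairB", 2)]

    read_pair_cache = {}
    complete_pairs = []

    for qname, mate in reads:
        if qname in read_pair_cache:
            # second mate seen: remove from cache and process as a pair
            complete_pairs.append((read_pair_cache.pop(qname), (qname, mate)))
        else:
            # first mate seen: wait for its partner
            read_pair_cache[qname] = (qname, mate)

    print(complete_pairs)      # both pairs complete
    print(read_pair_cache)     # {} -- anything still cached at a chromosome
                               # boundary is counted as a missing pair
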
- """ - cur_pos = self.cur_snp.pos % self.max_window - if self.snp_table[cur_pos] == 0: - self.num_snps += 1 - self.snp_table[cur_pos] = self.cur_snp - elif isinstance(self.snp_table[cur_pos], SNP): - self.snp_table[cur_pos].add_allele(self.cur_snp.alleles) + if (cur_tid is None) or (read.tid != cur_tid): + # this is a new chromosome + cur_chrom = files.input_bam.getrname(read.tid) + + if len(read_pair_cache) != 0: + sys.stderr.write("WARNING: failed to find pairs for %d " + "reads on this chromosome\n" % + len(read_pair_cache)) + read_stats.discard_missing_pair += len(read_pair_cache) + read_pair_cache = {} + cache_size = 0 + read_count = 0 - def add_indel(self): - """ - Add an indel to the indel table and indel dict. If there is already an - indel in the indel dict at this position, add the alleles from cur_snp. - """ - position = self.cur_snp.pos - if self.indel_dict.has_key(position): - start = self.indel_dict[position].max_len - self.indel_dict[position].add_allele(self.cur_snp.alleles) - else: - self.indel_dict[position] = self.cur_snp - start = 0 - end = self.indel_dict[position].max_len - # max_len is the length of the longest allele for an indel and - # "position" is the genomic position of this indel. If the indel_dict - # already has an indel at this genomic position, we will append - # "position" to all of the positions/sublists in indel_table beyond the - # lenght of the indel that already exists. If there isn't already an - # indel in indel_table at this "position", we'll append "position" to - # all of the sublists in indel_table that are spanned by the indel. - i = start - while (i < end) and ((self.cur_snp.pos + i) < (self.pos + self.max_window)): - self.indel_table[(self.cur_snp.pos + i) % self.max_window].append(position) - i += 1 - - def get_next_snp(self): - """Read in next SNP (and set self.cur_snp) or signal end of file.""" - snp_line = self.snpfile.readline() - if snp_line: - self.cur_snp = SNP(snp_line) - else: - self.end_of_snp_file = True - - def skip_chr(self): - """Skips all of the reads from the chromosome of the current read and - moves on to the next chromosome. Used if the SNP file can't be - located.""" - while self.cur_read.tid == self.chr_num: - try: - self.cur_read = self.bamfile.next() - except: - self.empty_table() - self.end_of_file = True - return - - self.chr_num = self.cur_read.tid - try: - self.chr_name = self.bamfile.getrname(self.chr_num) - except: - sys.stderr.write("Problem with tid: " + str(self.chr_num) + "\n") - self.skip_chr() - - def empty_slot_single(self): - """Processes all reads that map to the current position and - removes them from the read table Treats reads as single-end""" - cur_slot = self.pos % self.max_window - while len(self.read_table[cur_slot]) > 0: - self.tot += 1 - read = self.read_table[cur_slot].pop() - self.num_reads -= 1 - seqs = self.check_for_snps(read, 0) - # num_seqs it the numbers of different sequences for this read which - # includes the original sequence as well as the different sequences - # with alternate alleles swapped in. 
- num_seqs = len(seqs) - if (num_seqs == 0) or (num_seqs > 10): - continue - if (num_seqs == 1): - self.keep_bam.write(read) + if cur_chrom in seen_chrom: + # sanity check that input bam file is sorted + raise ValueError("expected input BAM file to be sorted " + "but chromosome %s is repeated\n" % cur_chrom) + seen_chrom.add(cur_chrom) + cur_tid = read.tid + sys.stderr.write("starting chromosome %s\n" % cur_chrom) + + # use HDF5 files if they are provided, otherwise use text + # files from SNP dir + if files.snp_tab_h5: + sys.stderr.write("reading SNPs from file '%s'\n" % + files.snp_tab_h5.filename) + snp_tab.read_h5(files.snp_tab_h5, files.snp_index_h5, + files.hap_h5, cur_chrom, samples) else: - self.remap_num_file.write("%i\n" % (num_seqs - 1)) - self.remap_num_file.flush() - self.remap_bam.write(read) - for seq in seqs[1:]: - loc_line = "%i:%s:%i:%i" % ( - self.remap_num, - self.chr_name, - read.pos, - num_seqs - 1) - self.fastqs[0].write("@%s\n%s\n+%s\n%s\n" % ( - loc_line, - seq, - loc_line, - read.qual)) - self.remap_num += 1 - self.shift_SNP_table() - - def empty_slot_paired(self): - """Processes all reads that map to the current position and - removes them from the read table. Treats reads as paired-end.""" - - cur_slot = self.pos % self.max_window - - # While there are reads in this slot... - while len(self.read_table[cur_slot]) > 0: - # Pop the first read in the slot - read = self.read_table[self.pos % self.max_window].pop() - self.num_reads -= 1 - - # Figure out the matching read position - pair_chr_num = read.rnext - pair_pos = read.mpos - if (pair_chr_num != self.chr_num) or \ - ((pair_pos - self.pos) > self.max_window): - continue - - # Find the slot the matching read is in - pair_slot = pair_pos % self.max_window - for indx in range(len(self.read_table[pair_slot])): - if self.read_table[pair_slot][indx].qname.split(":")[-1] == read.qname.split(":")[-1]: - pair_read = self.read_table[pair_slot].pop(indx) - self.num_reads -= 1 - seq1s = self.check_for_snps(read, 0) - seq2s = self.check_for_snps(pair_read, read.mpos - read.pos) - num_seqs = len(seq1s)*len(seq2s) - if (num_seqs == 0) or (num_seqs > 32): - break - if (num_seqs == 1): - self.keep_bam.write(read) - self.keep_bam.write(pair_read) - else: - self.remap_bam.write(read) - self.remap_bam.write(pair_read) - self.remap_num_file.write("%i\n" % (2*(num_seqs-1))) - first = True - for seq1 in seq1s: - for seq2 in seq2s: - if not first: - left_pos = min(read.pos, pair_read.pos) - right_pos = max(read.pos, pair_read.pos) - loc_line="%i:%s:%i:%i:%i" % ( - self.remap_num, - self.chr_name, - left_pos, - right_pos, - num_seqs - 1) - self.fastqs[0].write("@%s\n%s\n+%s\n%s\n" % ( - loc_line, - seq1, - loc_line, - read.qual)) - self.fastqs[1].write("@%s\n%s\n+%s\n%s\n" % ( - loc_line, - self.reverse_complement(seq2), - loc_line, - pair_read.qual)) - first=False - self.remap_num+=1 - # Stop searching for the pair since it was found. - break - self.shift_SNP_table() - - def check_for_snps(self, read, start_dist): - """ - Checks a single aligned read for overlapping SNPs and creates - alternative sequences for remapping. - - Parameters - ---------- - read : pysam.AlignedRead - Read to check for SNPs in. - - start_dist : int - I think this is the distance from the current position of the - BamScanner to the start of the read. - - Returns - ------- - seqs : list - List of read sequences. This first entry is the read sequence from - the bam file. 
Any subsequent sequences are the read sequence from - the bam file except one base that overlapped a SNP is switched to - the other allele. If the list is empty, the read overlaps an indel - or has a CIGAR character besides N or M so we throw it out. - """ - indx = read.pos % self.max_window - # p keeps track of the number of read bases we've already analyzed. When - # p = length of the read, we are done with this read. - p = 0 - # num_snps is the number of SNPs in this read. - num_snps = 0 - # I think seg_len is the distance from the current position of the - # BamScanner to where we are - seg_len = start_dist - seqs = [read.seq] - if start_dist > 0: - # has_junc indicates whether the read has an N in the CIGAR although - # this doesn't seem to be used anywhere. - has_junc = False - # read.cigar is a list of tuples. Each tuple has two entries. The first - # entry specifies the character in the cigar and the second entry - # specifies the length of that character. The values are - # M BAM_CMATCH 0 - # I BAM_CINS 1 - # D BAM_CDEL 2 - # N BAM_CREF_SKIP 3 - # S BAM_CSOFT_CLIP 4 - # H BAM_CHARD_CLIP 5 - # P BAM_CPAD 6 - # = BAM_CEQUAL 7 - # X BAM_CDIFF 8 - # So a tuple (0, 5) means five matches and (4, 2) means a soft clip of - # two. - - # We'll go through each cigar tuple one at a time. - for cigar in read.cigar: - seg_len += cigar[1] - # Check whether this cigar segment is longer than the max window. - # This generally happens if there is a junction read longer than the - # max window. - if seg_len > self.max_window: - self.window_too_small += 1 - return([]) - - if cigar[0] == 4: - # CIGAR indicates a soft-clipping - p = p + cigar[1] - elif cigar[0] == 0: - # CIGAR indicates a match alignment to the reference genome. - # Since there is a match, let's go through each matched base and - # see whether it contains a SNP. - for i in range(cigar[1]): - if len(self.indel_table[indx]) == 0: - snp = self.snp_table[indx] - if snp != 0: - num_snps += 1 - if num_snps > 10: - # If there are more than 10 snps overlapping, - # throw out the read to prevent memory blow-up. - # TODO: should we increment self.toss here? - return([]) - for seq in list(seqs): - matches = 0 - - for geno in snp.alleles: - if seq[p] == geno: - matches += 1 - for alt_geno in snp.alleles: - if not alt_geno == geno: - new_seq = (seq[:p] + alt_geno + - seq[p+1:]) - seqs.append(new_seq) - if matches == 0: - self.no_match += 1 - else: - self.ref_match += 1 + snp_filename = "%s/%s.snps.txt.gz" % (files.snp_dir, cur_chrom) + sys.stderr.write("reading SNPs from file '%s'\n" % snp_filename) + snp_tab.read_file(snp_filename) + + sys.stderr.write("processing reads\n") + + if read.is_secondary: + # this is a secondary alignment (i.e. read was aligned more than + # once and this has align score that <= best score) + read_stats.discard_secondary += 1 + continue + + if read.is_paired: + if read.next_reference_name is None: + # other side of pair not mapped... 
+ process_single_read(read, read_stats, files, snp_tab, max_seqs, + max_snps) + elif(read.next_reference_name == cur_chrom or + read.next_reference_name == "="): + # other pair mapped to same chrom + + # sys.stderr.write("flag: %s" % read.flag) + if not read.is_proper_pair: + # sys.stderr.write(' => improper\n') + read_stats.discard_improper_pair += 1 + continue + # sys.stderr.write(' => proper\n') + + if read.qname in read_pair_cache: + # we already saw prev pair, retrieve from cache + read1 = read_pair_cache[read.qname] + read2 = read + del read_pair_cache[read.qname] + cache_size -= 1 + + if read2.next_reference_start != read1.reference_start: + sys.stderr.write("WARNING: read pair positions " + "do not match for pair %s\n" % + read.qname) else: - # It's an indel, throw it out. - self.toss += 1 - return([]) - indx = (indx + 1) % self.max_window - p += 1 - elif cigar[0] == 3: - # Skipped in the reference genome (splice junction). - indx = (indx + cigar[1]) % self.max_window - has_junc = True + process_paired_read(read1, read2, read_stats, + files, snp_tab, max_seqs, + max_snps) + else: + # we need to wait for next pair + read_pair_cache[read.qname] = read + + cache_size += 1 + + else: - # There is a non-N/M in the read CIGAR--throw out the read. - self.toss += 1 - return([]) - - if len(seqs) == 1: - self.nosnp += 1 + # other side of pair mapped to different + # chromosome, discard this read + read_stats.discard_different_chromosome += 1 + else: - self.remap += 1 - return seqs + process_single_read(read, read_stats, files, snp_tab, + max_seqs, max_snps) + + if len(read_pair_cache) != 0: + sys.stderr.write("WARNING: failed to find pairs for %d " + "reads on this chromosome\n" % + len(read_pair_cache)) + read_stats.discard_missing_pair += len(read_pair_cache) - def shift_SNP_table(self): - """Shifts the SNP table over one position and makes sure that - indels are not lost.""" + read_stats.write(sys.stderr) + + +def process_paired_read(read1, read2, read_stats, files, + snp_tab, max_seqs, max_snps): + """Checks if either end of read pair overlaps SNPs or indels + and writes read pair (or generated read pairs) to appropriate + output files""" + + new_reads = [] + for read in (read1, read2): + # check if either read overlaps SNPs or indels + # check if read overlaps SNPs or indels + snp_idx, snp_read_pos, \ + indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read) - self.pos += 1 - - # Current slot to fill is the position + max_window - 1 - cur_slot=(self.pos-1) % self.max_window + if len(indel_idx) > 0: + # for now discard this read pair, we want to improve this to handle + # the indel reads appropriately + read_stats.discard_indel += 2 + # TODO: add option to handle indels instead of throwing out reads + return + + if len(snp_idx) > 0: + ref_alleles = snp_tab.snp_allele1[snp_idx] + alt_alleles = snp_tab.snp_allele2[snp_idx] + + count_ref_alt_matches(read, read_stats, snp_tab, snp_idx, + snp_read_pos) + + # limit recursion here by discarding reads that + # overlap too many SNPs + if len(snp_read_pos) > max_snps: + read_stats.discard_excess_snps += 1 + return - # Delete indels that are no longer used (if they ended at the previous position) - for indel_pos in self.indel_table[cur_slot]: - if (indel_pos + self.indel_dict[indel_pos].max_len-1) == (self.pos-1): - del self.indel_dict[indel_pos] - - self.indel_table[cur_slot]=[] - - # Carry over indels from the previous slot - for indel_pos in self.indel_table[cur_slot-1]: - if (indel_pos + self.indel_dict[indel_pos].max_len-1) >= 
(self.pos+self.max_window-1): - self.indel_table[cur_slot].append(indel_pos) - - if self.snp_table[cur_slot] != 0: - self.num_snps -= 1 - self.snp_table[cur_slot] = 0 - - # See if there is a SNP overlapping the current spot. - while not self.end_of_snp_file and self.pos + self.max_window-1 > self.cur_snp.pos: - sys.stderr.write(str(self.num_snps) + " " + str(self.pos) + " " + - str(self.cur_snp.pos)+" !!!\n") - sys.stderr.write("SNP out of order has been skipped\n") - self.get_next_snp() - - while not self.end_of_snp_file and (self.cur_snp.pos == (self.pos + self.max_window - 1)): - if self.cur_snp.ptype == "snp": - self.add_snp() + if files.hap_h5: + # generate reads using observed set of haplotypes + read_seqs = generate_haplo_reads(read.query, snp_idx, + snp_read_pos, + ref_alleles, alt_alleles, + snp_tab.haplotypes) else: - self.add_indel() - if not self.cur_snp.pos in self.indel_table[cur_slot]: - self.indel_table[cur_slot].append(cur_snp.pos) - self.get_next_snp() - - def empty_table(self): - """Completely empties the read_table by repeatedly calling - empty_slot function""" - end_pos = self.pos + self.max_window - while self.pos < end_pos: - if self.is_paired_end: - self.empty_slot_paired() - else: - self.empty_slot_single() - - def complement(self, letter): - if letter == 'A': - return('T') - elif letter == 'T': - return('A') - elif letter == 'C': - return('G') - elif letter == 'G': - return('C') + # generate all possible allelic combinations of reads + read_seqs = generate_reads(read.query, snp_read_pos, + ref_alleles, alt_alleles, 0) + + new_reads.append(read_seqs) else: - return(letter) - - def reverse_complement(self, read): - # reverse = "" - # for letter in read: - # reverse = self.complement(letter) + reverse - # return reverse - reverse = [self.complement(letter) for letter in list(read)] - reverse.reverse() - return ''.join(reverse) + # no SNPs or indels overlap this read + new_reads.append([]) + + if len(new_reads[0]) == 0 and len(new_reads[1]) == 0: + # neither read overlapped SNPs or indels + files.keep_bam.write(read1) + files.keep_bam.write(read2) + read_stats.keep_pair += 1 + else: + # add original version of both sides of pair + new_reads[0].append(read1.query) + new_reads[1].append(read2.query) + + if len(new_reads[0]) + len(new_reads[1]) > max_seqs: + # quit now before generating a lot of read pairs + read_stats.discard_excess_reads += 2 + return + + # collect all unique combinations of read pairs + unique_pairs = set([]) + n_unique_pairs = 0 + for new_read1 in new_reads[0]: + for new_read2 in new_reads[1]: + pair = (new_read1, new_read2) + if pair in unique_pairs: + pass + else: + n_unique_pairs += 1 + if n_unique_pairs > max_seqs: + read_stats.discard_excess_reads += 2 + return + unique_pairs.add(pair) + + # remove original read pair, if present + orig_pair = (read1.query, read2.query) + + if orig_pair in unique_pairs: + unique_pairs.remove(orig_pair) + + # write read pair to fastqs for remapping + write_pair_fastq(files.fastq1, files.fastq2, read1, read2, + unique_pairs) + + # Write read to 'remap' BAM for consistency with previous + # implementation of script. Probably not needed and will result in + # BAM that is not coordinate sorted. Possibly remove this... 
+ files.remap_bam.write(read1) + files.remap_bam.write(read2) + read_stats.remap_pair += 1 + + + - def run(self): - """Iterate through bam and SNP files and write output files.""" - self.fill_table() - while not self.end_of_file: - if self.is_paired_end: - self.empty_slot_paired() - else: - self.empty_slot_single() - self.fill_table() - - if self.window_too_small > 0: - sys.stderr.write( - 'Segment distance (from read pair and junction separation) was ' - 'too large for %d reads so those reads have been thrown out. ' - 'Consider increasing the max window ' - 'size.\n' % self.window_too_small) - - sys.stderr.write("Finished!\n") - self.keep_bam.close() - self.remap_bam.close() - self.remap_num_file.close() - [x.close() for x in self.fastqs] - - - -def main(infile, snp_dir, max_window=MAX_WINDOW_DEFAULT, - is_paired_end=False, is_sorted=False): - name_split = infile.split(".") + - if len(name_split) > 1: - pref = ".".join(name_split[:-1]) - else: - pref = name_split[0] + +def process_single_read(read, read_stats, files, snp_tab, max_seqs, + max_snps): + """Check if a single read overlaps SNPs or indels, and writes + this read (or generated read pairs) to appropriate output files""" + + # check if read overlaps SNPs or indels + snp_idx, snp_read_pos, \ + indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read) + - if not is_sorted: - pysam.sort(infile, pref + ".sort") - infile = pref + ".sort" - sort_file_name = pref + ".sort.bam" + if len(indel_idx) > 0: + # for now discard this read, we want to improve this to handle + # the indel reads appropriately + read_stats.discard_indel += 1 + # TODO: add option to handle indels instead of throwing out reads + return + + if len(snp_idx) > 0: + ref_alleles = snp_tab.snp_allele1[snp_idx] + alt_alleles = snp_tab.snp_allele2[snp_idx] + + count_ref_alt_matches(read, read_stats, snp_tab, snp_idx, + snp_read_pos) + + # limit recursion here by discarding reads that + # overlap too many SNPs + if len(snp_read_pos) > max_snps: + read_stats.discard_excess_snps += 1 + return + + if files.hap_h5: + read_seqs = generate_haplo_reads(read.query, snp_idx, + snp_read_pos, + ref_alleles, alt_alleles, + snp_tab.haplotypes) + else: + read_seqs = generate_reads(read.query, snp_read_pos, + ref_alleles, alt_alleles, 0) + + # make set of unique reads, we don't want to remap + # duplicates, or the read that matches original + unique_reads = set(read_seqs) + if read.query in unique_reads: + unique_reads.remove(read.query) + + if len(unique_reads) == 0: + # only read generated matches original read, + # so keep original + files.keep_bam.write(read) + read_stats.keep_single += 1 + elif len(unique_reads) < max_seqs: + # write read to fastq file for remapping + write_fastq(files.fastq_single, read, unique_reads) + + # write read to 'to remap' BAM + # this is probably not necessary with new implmentation + # but kept for consistency with previous version of script + files.remap_bam.write(read) + read_stats.remap_single += 1 + else: + # discard read + read_stats.discard_excess_reads += 1 + return + else: - sort_file_name = infile + # no SNPs overlap read, write to keep file + files.keep_bam.write(read) + read_stats.keep_single += 1 + - keep_file_name = pref + ".keep.bam" - remap_name = pref + ".to.remap.bam" - remap_num_name = pref + ".to.remap.num.gz" - if is_paired_end: - fastq_names = [pref + ".remap.fq1.gz", - pref + ".remap.fq2.gz"] - else: - fastq_names = [pref + ".remap.fq.gz"] - bam_data = BamScanner(is_paired_end, max_window, - sort_file_name, keep_file_name, 
remap_name, - remap_num_name, fastq_names, snp_dir) - bam_data.run() +def parse_samples(samples_str): + """Gets list of samples from --samples argument. This may be + a comma-delimited string or a path to a file. If a file is provided + then the first column of the file is assumed to be the sample name""" + if samples_str is None: + return None + + # first check if this is a path to a file + if os.path.exists(samples_str) and not os.path.isdir(samples_str): + samples = [] + if samples_str.endswith(".gz"): + f = gzip.open(samples_str) + else: + f = open(samples_str) + for line in f: + # assume first token in line is sample name + samples.append(line.split()[0]) -if __name__ == '__main__': - parser = argparse.ArgumentParser() - - parser.add_argument("-p", "--paired_end", - action='store_true', - dest='is_paired_end', default=False, - help=('Indicates that reads are ' - 'paired-end (default is single).')) + sys.stderr.write("read %d sample names from file '%s'\n" % + (len(samples), samples_str)) + + f.close() + else: + # otherwise assume comma-delimited string + if ("," not in samples_str and len(samples_str) > 15) \ + or ("/" in samples_str): + sys.stderr.write("WARNING: --samples argument (%s) " + "does not look like sample name " + "but is not path to valid file. " + "Assuming it is a sample name anyway." + % samples_str) + + samples = samples_str.split(",") + sys.stderr.write("SAMPLES: %s\n"% repr(samples)) + + + return samples + + + +def main(bam_filenames, is_paired_end=False, + is_sorted=False, max_seqs=MAX_SEQS_DEFAULT, + max_snps=MAX_SNPS_DEFAULT, output_dir=None, + snp_dir=None, snp_tab_filename=None, + snp_index_filename=None, + haplotype_filename=None, samples=None): + + files = DataFiles(bam_filenames, is_sorted, is_paired_end, + output_dir=output_dir, + snp_dir=snp_dir, + snp_tab_filename=snp_tab_filename, + snp_index_filename=snp_index_filename, + haplotype_filename=haplotype_filename) - parser.add_argument("-s", "--sorted", - action='store_true', dest='is_sorted', default=False, - help=('Indicates that the input bam file' - ' is coordinate sorted (default is False)')) - - - mhelp = ('Changes the maximum window to search for SNPs. The default is ' - '%d base pairs. Reads or read pairs that span more than this ' - 'distance (usually due to splice junctions) will be thrown out. ' - 'Increasing this window allows for longer junctions, but may ' - 'increase run time and memory requirements.' % MAX_WINDOW_DEFAULT) - parser.add_argument("-m", "--max_window", - action='store', dest='max_window', type=int, - default=MAX_WINDOW_DEFAULT, help=mhelp) + filter_reads(files, max_seqs=max_seqs, max_snps=max_snps, + samples=samples) + + files.close() - parser.add_argument("infile", action='store', help=("Coordinate sorted bam " - "file.")) - snp_dir_help = ('Directory containing the SNPs segregating within the ' - 'sample in question (which need to be checked for ' - 'mappability issues). This directory should contain ' - 'sorted files of SNPs separated by chromosome and named: ' - 'chr<#>.snps.txt.gz. 
These files should contain 3 columns: ' - 'position RefAllele AltAllele') - parser.add_argument("snp_dir", action='store', help=snp_dir_help) - options = parser.parse_args() - - main(options.infile, options.snp_dir, - max_window=options.max_window, - is_paired_end=options.is_paired_end, - is_sorted=options.is_sorted) +if __name__ == '__main__': + options = parse_options() + + samples = parse_samples(options.samples) + + sys.stderr.write("command line: %s\n" % " ".join(sys.argv)) + + main(options.bam_filename, + is_paired_end=options.is_paired_end, is_sorted=options.is_sorted, + max_seqs=options.max_seqs, max_snps=options.max_snps, + output_dir=options.output_dir, + snp_dir=options.snp_dir, + snp_tab_filename=options.snp_tab, + snp_index_filename=options.snp_index, + haplotype_filename=options.haplotype, + samples=samples) + diff --git a/mapping/get_as_counts.py b/mapping/get_as_counts.py new file mode 100644 index 0000000..9f7a706 --- /dev/null +++ b/mapping/get_as_counts.py @@ -0,0 +1,325 @@ +import sys +import argparse +import numpy as np +import pysam + +import util +import snptable + +import tables + +import os + +def write_results(out_f, chrom_name, snp_tab, ref_matches, + alt_matches, oth_matches, geno_sample): + + haps = None + has_haps = False + + if geno_sample: + # get index for this sample in the haplotype table + samp_idx_dict = dict(zip(snp_tab.samples, + range(len(snp_tab.samples)))) + + if geno_sample in samp_idx_dict: + idx = samp_idx_dict[geno_sample] + geno_hap_idx = np.array([idx*2, idx*2+1], dtype=np.int) + haps = snp_tab.haplotypes[:,geno_hap_idx] + has_haps = True + sys.stderr.write("geno_hap_idx: %s\n" % repr(geno_hap_idx)) + else: + sys.stderr.write("WARNING: sample %s is not present for " + "chromosome %s\n" % (geno_sample, chrom_name)) + haps = None + has_haps = False + + for i in range(snp_tab.n_snp): + if has_haps: + geno_str = "%d|%d" % (haps[i, 0], haps[i, 1]) + else: + geno_str = "NA" + out_f.write("%s %d %s %s %s %d %d %d\n" % + (chrom_name, snp_tab.snp_pos[i], + snp_tab.snp_allele1[i], snp_tab.snp_allele2[i], + geno_str, ref_matches[i], alt_matches[i], + oth_matches[i])) + + +def write_header(out_f): + out_f.write("CHROM SNP.POS REF.ALLELE ALT.ALLELE GENOTYPE REF.COUNT " + "ALT.COUNT OTHER.COUNT\n") + + + +def parse_samples(samples_str): + """Gets list of samples from --samples argument. This may be + a comma-delimited string or a path to a file. If a file is provided + then the first column of the file is assumed to be the sample name""" + + if samples_str is None: + return None + + # first check if this is a path to a file + if os.path.exists(samples_str) and not os.path.isdir(samples_str): + samples = [] + + if samples_str.endswith(".gz"): + f = gzip.open(samples_str) + else: + f = open(samples_str) + + for line in f: + # assume first token in line is sample name + samples.append(line.split()[0]) + + sys.stderr.write("read %d sample names from file '%s'\n" % + (len(samples), samples_str)) + + f.close() + else: + # otherwise assume comma-delimited string + if ("," not in samples_str and len(samples_str) > 15) \ + or ("/" in samples_str): + sys.stderr.write("WARNING: --samples argument (%s) " + "does not look like sample name " + "but is not path to valid file. " + "Assuming it is a sample name anyway." 
+ % samples_str) + + samples = samples_str.split(",") + sys.stderr.write("SAMPLES: %s\n"% repr(samples)) + + + return samples + + + + + +def parse_options(): + parser = argparse.ArgumentParser(description="This script outputs " + "allele-specific counts for SNPs, using " + "reads from the provided BAM file. " + "Currently indels are not output and " + "chromosomes with no mapped reads " + "are skipped. Output " + "is written to stdout, with a single " + "header row and the following " + "columns: " + " " + " " + ". Reads that overlap " + "multiple SNPs will be counted multiple " + "times in the output (this behavior " + "differs from the " + "extract_haplotype_read_counts.py " + "script).") + + + parser.add_argument("--snp_dir", action='store', + help=("Directory containing SNP text files " + "This directory should contain one file per " + "chromosome named like chr<#>.snps.txt.gz. " + "Each file should contain 3 columns: position " + "RefAllele AltAllele"), + default=None) + + + parser.add_argument("--snp_tab", + help="Path to HDF5 file to read SNP information " + "from. Each row of SNP table contains SNP name " + "(rs_id), position, allele1, allele2.", + metavar="SNP_TABLE_H5_FILE", + default=None) + + parser.add_argument("--snp_index", + help="Path to HDF5 file containing SNP index. The " + "SNP index is used to convert the genomic position " + "of a SNP to its corresponding row in the haplotype " + "and snp_tab HDF5 files.", + metavar="SNP_INDEX_H5_FILE", + default=None) + + parser.add_argument("--haplotype", + help="Path to HDF5 file to read phased haplotypes " + "from. When generating alternative reads " + "use known haplotypes from this file rather " + "than all possible allelic combinations.", + metavar="HAPLOTYPE_H5_FILE", + default=None) + + parser.add_argument("--samples", + help="Use only haplotypes and SNPs that are " + "polymorphic in these samples. " + "SAMPLES can either be a comma-delimited string " + "of sample names or a path to a file with one sample " + "name per line (file is assumed to be " + "whitespace-delimited and first column is assumed to " + "be sample name). Sample names should match those " + "present in the haplotype HDF5 file. Samples are " + "ignored if no haplotype file is provided.", + metavar="SAMPLES", default=None) + + + parser.add_argument("--genotype_sample", + metavar="GENO_SAMPLE", + help="output genotypes for sample with name " + "GENO_SAMPLE alongside allele-specific counts. " + "GENO_SAMPLE must match one " + "of the names present in the haplotype HDF5 file. " + "If the --samples argument is provided then " + "GENO_SAMPLE must also be one of the specified " + "samples. 
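The options above come in two mutually exclusive flavors: a plain-text `--snp_dir`, or the HDF5 trio `--snp_tab`/`--snp_index`/`--haplotype`. As a hedged illustration, the `main()` function defined just below could be driven directly in either mode (all file names and sample names here are hypothetical placeholders):

    # HDF5 input, restricted to two samples, reporting genotypes for NA12878
    main("NA12878.sort.bam",
         snp_tab_filename="snp_tab.h5",
         snp_index_filename="snp_index.h5",
         haplotype_filename="haps.h5",
         samples=["NA12878", "NA12891"],
         geno_sample="NA12878")

    # equivalent text input: one chr<#>.snps.txt.gz file per chromosome
    main("NA12878.sort.bam", snp_dir="snps/")
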
If --genotype_sample is " + "not provided or the GENO_SAMPLE does not match any " + "of the samples in haplotype file then NA is " + "output for genotype.", default=None) + + parser.add_argument("bam_filename", action='store', + help="Coordinate-sorted input BAM file " + "containing mapped reads.") + + + options = parser.parse_args() + + if options.snp_dir: + if(options.snp_tab or options.snp_index or options.haplotype): + parser.error("expected --snp_dir OR (--snp_tab, --snp_index and " + "--haplotype) arguments but not both") + else: + if not (options.snp_tab and options.snp_index and options.haplotype): + parser.error("either --snp_dir OR (--snp_tab, " + "--snp_index AND --haplotype) arguments must be " + "provided") + + return options + + + +def main(bam_filename, snp_dir=None, snp_tab_filename=None, + snp_index_filename=None, haplotype_filename=None, samples=None, + geno_sample=None): + + out_f = sys.stdout + + bam = pysam.Samfile(bam_filename) + + cur_chrom = None + cur_tid = None + seen_chrom = set([]) + + snp_tab = snptable.SNPTable() + read_pair_cache = {} + + # keep track of number of ref matches, non-ref matches, and other + # for each SNP + snp_ref_match = None + snp_alt_match = None + snp_other_match = None + + + if geno_sample and not haplotype_filename: + sys.stderr.write("WARNING: cannot obtain genotypes for sample " + "%s without --haplotype argument\n") + geno_sample = None + + sys.stderr.write("GENOTYPE_SAMPLE: %s\n" % geno_sample) + + if snp_tab_filename: + if (not snp_index_filename) or (not haplotype_filename): + raise ValueError("--snp_index and --haplotype must be provided " + "if --snp_tab is provided") + snp_tab_h5 = tables.openFile(snp_tab_filename, "r") + snp_index_h5 = tables.openFile(snp_index_filename, "r") + hap_h5 = tables.openFile(haplotype_filename, "r") + else: + snp_tab_h5 = None + snp_index_h5 = None + hap_h5 = None + + for read in bam: + if (cur_tid is None) or (read.tid != cur_tid): + # this is a new chromosome + + if cur_chrom: + # write out results from last chromosome + write_results(out_f, cur_chrom, snp_tab, snp_ref_match, + snp_alt_match, snp_oth_match, geno_sample) + + cur_chrom = bam.getrname(read.tid) + + if cur_chrom in seen_chrom: + # sanity check that input bam file is sorted + raise ValueError("expected input BAM file to be sorted " + "but chromosome %s is repeated\n" % cur_chrom) + seen_chrom.add(cur_chrom) + cur_tid = read.tid + sys.stderr.write("starting chromosome %s\n" % cur_chrom) + + # read SNPs for next chromomsome + if snp_tab_h5: + # read SNPs from HDF5 files, reduce to set that are + # polymorphic in specified samples + snp_tab.read_h5(snp_tab_h5, snp_index_h5, hap_h5, + cur_chrom, samples=samples) + elif snp_dir: + # read SNPs from text file + snp_filename = "%s/%s.snps.txt.gz" % (snp_dir, cur_chrom) + snp_tab.read_file(snp_filename) + else: + raise ValueError("--snp_dir OR (--snp_tab, --snp_index, " + "and --hap_h5) must be defined") + + sys.stderr.write("read %d SNPs\n" % snp_tab.n_snp) + + # clear SNP table and results + snp_ref_match = np.zeros(snp_tab.n_snp, dtype=np.int16) + snp_alt_match = np.zeros(snp_tab.n_snp, dtype=np.int16) + snp_oth_match = np.zeros(snp_tab.n_snp, dtype=np.int16) + + + if read.is_secondary: + # this is a secondary alignment (i.e. 
read was aligned more than + # once and this has align score that <= best score) + continue + + # loop over all SNP that overlap this read + snp_idx, snp_read_pos, \ + indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read) + + for snp_i, read_pos in zip(snp_idx, snp_read_pos): + snp_pos = snp_tab.snp_pos[snp_i] + ref_allele = snp_tab.snp_allele1[snp_i] + alt_allele = snp_tab.snp_allele2[snp_i] + + if ref_allele == read.query[read_pos-1]: + snp_ref_match[snp_i] += 1 + elif alt_allele == read.query[read_pos-1]: + snp_alt_match[snp_i] += 1 + else: + snp_oth_match[snp_i] += 1 + + if cur_chrom: + # write results for final chromosome + write_results(out_f, cur_chrom, snp_tab, snp_ref_match, + snp_alt_match, snp_oth_match, geno_sample) + + + + + +if __name__ == "__main__": + sys.stderr.write("command: %s\n" % " ".join(sys.argv)) + + options = parse_options() + samples = parse_samples(options.samples) + + + main(options.bam_filename, + snp_dir=options.snp_dir, + snp_tab_filename=options.snp_tab, + snp_index_filename=options.snp_index, + haplotype_filename=options.haplotype, + samples=samples, geno_sample=options.genotype_sample) + + + diff --git a/mapping/rmdup_pe.py b/mapping/rmdup_pe.py index 10c23df..fad63a1 100644 --- a/mapping/rmdup_pe.py +++ b/mapping/rmdup_pe.py @@ -1,106 +1,260 @@ -from random import choice +import random import pysam import os import sys import argparse -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('input_bam', help="input BAM or SAM file (must be sorted!)") - parser.add_argument("output_bam", help="output BAM or SAM file") - - max_window=10000 - options = parser.parse_args() - if options.input_bam.endswith(".sam") or options.input_bam.endswith("sam.gz"): - infile = pysam.Samfile(options.input_bam, "r") + +class ReadStats(object): + + def __init__(self): + # number of reads discarded because not proper pair + self.discard_improper_pair = 0 + + # paired reads map to different chromosomes + self.discard_different_chromosome = 0 + + # number of reads discarded because secondary match + self.discard_secondary = 0 + + # reads where we expected to see other pair, but it was missing + # possibly due to read-pairs with different names + self.discard_missing_pair = 0 + + # reads with only one paired mapped + self.discard_single = 0 + + # reads discarded because duplicated + self.discard_dup = 0 + + # number of read pairs kept + self.keep_pair = 0 + + + def write(self, file_handle): + sys.stderr.write("DISCARD reads:\n" + " improper pair: %d\n" + " different chromosome: %d\n" + " secondary alignment: %d\n" + " missing pairs (e.g. 
mismatched read names): %d\n" + " not paired: %d\n" + " duplicate pairs: %d\n" + "KEEP reads:\n" + " pairs: %d\n" % + (self.discard_improper_pair, + self.discard_different_chromosome, + self.discard_secondary, + self.discard_missing_pair, + self.discard_single, + self.discard_dup, + self.keep_pair)) + + + + + +def main(input_bam, output_bam): + if input_bam.endswith(".sam") or input_bam.endswith("sam.gz"): + infile = pysam.Samfile(input_bam, "r") else: # assume binary BAM file - infile = pysam.Samfile(options.input_bam, "rb") + infile = pysam.Samfile(input_bam, "rb") - if options.output_bam.endswith(".sam"): + if output_bam.endswith(".sam"): # output in text SAM format - outfile = pysam.Samfile(options.output_bam, "w", template=infile) - elif options.output_bam.endswith(".bam"): + outfile = pysam.Samfile(output_bam, "w", template=infile) + elif output_bam.endswith(".bam"): # output in binary compressed BAM format - outfile = pysam.Samfile(options.output_bam, "wb", template=infile) + outfile = pysam.Samfile(output_bam, "wb", template=infile) else: raise ValueError("name of output file must end with .bam or .sam") - readf=Read_filter(infile,outfile,max_window) + filter_reads(infile, outfile) + infile.close() outfile.close() -class Read_filter: - def __init__(self,infile,outfile,max_window): - self.read_table=[[] for x in range(max_window)] - self.num_reads=0 - self.cur_pos=0 - self.chr="" - self.cur_read=infile.next() - self.infile=infile - self.outfile=outfile - self.max_window=max_window - self.finished=False - while not self.finished: - self.fill_table() - self.empty_slot() - self.empty_table() - - def fill_table(self): - if self.cur_read.rname != self.chr: - self.empty_table() - self.chr=self.cur_read.rname - - if self.num_reads==0: - self.cur_pos=self.cur_read.pos - while not self.finished and self.cur_read.rname==self.chr and self.cur_read.pos0: - self.empty_slot() - def empty_slot(self): - ends=dict() + +def update_read_cache(cur_by_mpos, keep_cache, discard_cache, + read_stats, outfile): + for mpos, read_list in cur_by_mpos.items(): + # only keep one read from list with same pos,mate_pos pair + # shuffle order of reads in list and take first + # as 'keep' read + random.shuffle(read_list) + keep_read = read_list.pop() + if keep_read.qname in keep_cache: + raise ValueError("read %s is already " + "in keep cache" % keep_read.qname) + keep_cache[keep_read.qname] = keep_read + + # rest of reads get discarded + for discard_read in read_list: + # corner case: if reads are completely overlapping + # (same start pos) then we either want to keep both + # or discard both right now + if discard_read.qname in discard_cache: + # discard both reads from pair + del discard_cache[discard_read.qname] + elif discard_read.qname == keep_read.qname: + # keep both reads from pair + read_stats.keep_pair += 1 + outfile.write(keep_read) + outfile.write(discard_read) + del keep_cache[keep_read.qname] + else: + discard_cache[discard_read.qname] = discard_read + + +def filter_reads(infile, outfile): + read_stats = ReadStats() + + cur_tid = None + seen_chrom = set([]) + + # name of reads to keep + keep_cache = {} + # name of reads to discard + discard_cache = {} + cur_by_mpos = {} + + read_count = 0 + + # current position on chromosome + cur_pos = None + # lists of reads at current position, + # grouped by the mate pair position + cur_by_mpos = {} + + for read in infile: + read_count += 1 + + if (cur_tid is None) or (read.tid != cur_tid): + # this is a new chromosome + cur_chrom = infile.getrname(read.tid) + + if 
cur_pos: + update_read_cache(cur_by_mpos, keep_cache, discard_cache, + read_stats, outfile) + + if len(keep_cache) + len(discard_cache) != 0: + sys.stderr.write("WARNING: failed to find pairs for %d " + "reads on this chromosome\n" % + (len(keep_cache) + len(discard_cache))) + read_stats.discard_missing_pair += len(keep_cache) + \ + len(discard_cache) + + sys.stderr.write("keep_cache:\n") + for r in keep_cache.values(): + sys.stderr.write(" %s\n" % r.qname) + sys.stderr.write("discard_cache:\n") + for r in discard_cache.values(): + sys.stderr.write(" %s\n" % r.qname) + + keep_cache = {} + discard_cache = {} + cur_pos = None + cur_by_mpos = {} + read_count = 0 + + if cur_chrom in seen_chrom: + # sanity check that input bam file is sorted + raise ValueError("expected input BAM file to be sorted " + "but chromosome %s is repeated\n" % cur_chrom) + seen_chrom.add(cur_chrom) + cur_tid = read.tid + sys.stderr.write("starting chromosome %s\n" % cur_chrom) + sys.stderr.write("processing reads\n") + + if read.is_secondary: + # this is a secondary alignment (i.e. read was aligned more than + # once and this has align score that <= best score) + read_stats.discard_secondary += 1 + continue + + if (not read.is_paired) or (read.next_reference_name is None): + read_stats.discard_single += 1 + continue + + if (read.next_reference_name != cur_chrom) and \ + (read.next_reference_name != "="): + # other side of pair mapped to different chromosome + read_stats.discard_different_chromosome += 1 + continue + + if not read.is_proper_pair: + read_stats.discard_improper_pair += 1 + continue + + if (cur_pos is not None) and (read.pos < cur_pos): + raise ValueError("expected input BAM file to be sorted " + "but reads are out of order") - for read in self.read_table[self.cur_pos % self.max_window]: - mate_pos=read.mpos - if mate_pos in ends: - ends[mate_pos].append(read) + if cur_pos is None or read.pos > cur_pos: + # we have advanced to a new start position + # decide which of reads at last position to keep or discard + update_read_cache(cur_by_mpos, keep_cache, discard_cache, + read_stats, outfile) + + # create new list of reads at current position + cur_pos = read.pos + cur_by_mpos = {} + + if read.qname in keep_cache: + # we already saw prev side of pair, retrieve from cache + read1 = keep_cache[read.qname] + read2 = read + del keep_cache[read.qname] + + if read2.next_reference_start != read1.reference_start: + sys.stderr.write("WARNING: read pair positions " + "do not match for pair %s\n" % read.qname) + + read_stats.keep_pair += 1 + outfile.write(read1) + outfile.write(read2) + + elif read.qname in discard_cache: + # we already saw prev side of pair, but decided to discard + # because read duplicated + del discard_cache[read.qname] + read_stats.discard_dup += 1 + + else: + # we have not seen other side of this read yet + # add read to list of those at current position + # grouping by mate-pair position + if read.mpos in cur_by_mpos: + cur_by_mpos[read.mpos].append(read) else: - ends[mate_pos]=[read] + cur_by_mpos[read.mpos] = [read] + + # final update of read cache is just to cache strange corner case + # where final read pair on chromosome were overlapping (same start pos) + if cur_pos: + update_read_cache(cur_by_mpos, keep_cache, discard_cache, + read_stats, outfile) + + if (len(keep_cache) + len(discard_cache)) != 0: + sys.stderr.write("WARNING: failed to find pairs for %d " + "keep reads and %d discard reads on this " + "chromosome\n" % (len(keep_cache), len(discard_cache))) - 
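In short, the rewritten filter groups read pairs by their (start, mate-start) coordinates, keeps one pair chosen at random from each group, and discards the rest, tallying everything in `ReadStats`. A minimal, hedged usage sketch of the new `main()` (the file names are placeholders; the input BAM must be coordinate sorted, and the output BAM is not sorted):

    import rmdup_pe

    # remove duplicate read pairs from a coordinate-sorted BAM
    rmdup_pe.main("sample1.keep.merged.sort.bam", "sample1.keep.rmdup.bam")
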
self.num_reads-=len(self.read_table[self.cur_pos % self.max_window]) - self.read_table[self.cur_pos % self.max_window] = [] + read_stats.discard_missing_pair += len(keep_cache) + len(discard_cache) + + read_stats.write(sys.stderr) + - for end_key in ends: - read_list=ends[end_key] - if read_list[0].mpos < self.cur_pos: - continue - - while len(read_list)>0: - keep_indx=choice(range(len(read_list))) - keep_read=read_list.pop(keep_indx) - read_name=keep_read.qname - mate_pos=keep_read.mpos - found=False - for mate_read in self.read_table[mate_pos % self.max_window]: - #sys.stderr.write("%s\t%s\n" % (mate_read.qname,keep_read.qname)) - if mate_read.qname==keep_read.qname: - self.outfile.write(keep_read) - self.outfile.write(mate_read) - found=True - break - if self.num_reads>0: - while len(self.read_table[self.cur_pos % self.max_window])==0: - self.cur_pos+=1 - - - -main() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('input_bam', help="input BAM or SAM file (must " + "be sorted!)") + parser.add_argument("output_bam", help="output BAM or SAM file (not " + "sorted!)") + + options = parser.parse_args() + + main(options.input_bam, options.output_bam) diff --git a/mapping/run_snakemake.sh b/mapping/run_snakemake.sh new file mode 100755 index 0000000..d4bf690 --- /dev/null +++ b/mapping/run_snakemake.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# Run snakemake on an SGE cluster (that uses qsub command for submission) +# Run at most 20 jobs at a time +# To run on an LSF cluster, change "qsub -V" to "bsub" +snakemake --cluster "qsub -V" --jobs 20 --rerun-incomplete diff --git a/mapping/sim_reads/run_sim_pe_reads.sh b/mapping/sim_reads/run_sim_pe_reads.sh new file mode 100755 index 0000000..6b20b35 --- /dev/null +++ b/mapping/sim_reads/run_sim_pe_reads.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +WASP=$HOME/proj/WASP + +python sim_pe_reads.py --seq $WASP/test_data/seq.h5 \ + --n_reads 10000 \ + --read_len 36 \ + --hap_file $WASP/example_data/genotypes/chr22.hg19.haplotype.txt.gz \ + --out_fastq1 $WASP/example_data/sim_pe_reads1.fastq.gz \ + --out_fastq2 $WASP/example_data/sim_pe_reads2.fastq.gz + diff --git a/mapping/sim_reads/sim_pe_reads.py b/mapping/sim_reads/sim_pe_reads.py new file mode 100644 index 0000000..40e7e83 --- /dev/null +++ b/mapping/sim_reads/sim_pe_reads.py @@ -0,0 +1,349 @@ + + +import argparse +import gzip +import numpy as np +import numpy.random +import sys +import string + +import tables + + + +class Haplotypes(object): + def __init__(self, pos_array, ref_allele_array, alt_allele_array, + hap1_array, hap2_array): + self.pos = pos_array + self.ref_allele = ref_allele_array + self.alt_allele = alt_allele_array + self.hap1 = hap1_array + self.hap2 = hap2_array + + + +class ReadCoord(object): + def __init__(self, chrom_name, left_start, left_end, right_start, right_end): + + if left_start > left_end: + raise ValueError("left_start must be <= left_end") + if left_start > right_start: + raise ValueError("left_start must be <= right_start") + if right_start > right_end: + raise ValueError("right_start must be <= right_end") + + self.chrom_name = chrom_name + self.left_start = left_start + self.left_end = left_end + self.right_start = right_start + self.right_end = right_end + + + +dna_comp = None + +def comp(seq_str): + """complements the provided DNA sequence and returns it""" + global dna_comp + + if dna_comp is None: + dna_comp = string.maketrans("ATCGMRWSYKNatcgmrwsykn", + "TAGCKYWSRMNtagckywsrmn") + return seq_str.translate(dna_comp) + + +def 
revcomp(seq_str): + """returns reverse complement of provided DNA sequence""" + return comp(seq_str)[::-1] + + + +def parse_options(): + parser=argparse.ArgumentParser( + description="Simulates paired-end reads that can be used to " + "test the mapping pipeline") + + parser.add_argument("--seq", required=True, + help="Path to HDF5 file containing " + "genome sequence. (Can be created " + "using fasta2h5 program)", + metavar="SEQ_H5_FILE") + + parser.add_argument("--n_reads", action='store', + required=True, type=int, + help="number of reads to simulate") + + parser.add_argument("--out_fastq1", action='store', + required=True, + help="output file to write read1 to") + + parser.add_argument("--out_fastq2", action='store', + required=True, + help="output file to write read2 to") + + parser.add_argument("--read_len", default=36, + type=int) + + parser.add_argument("--insert_size_mean", default=100.0, + help="mean insert size (assumed to be normally distributed)", + type=float) + + parser.add_argument("--insert_size_sd", default=50.0, + help="standard devaition of insert size", + type=float) + + parser.add_argument("--chrom", default="chr22", + help="for now just simulate from one chrom" + "in future may simulate uniformly across entire genome...") + + parser.add_argument("--hap_file", required=True, + help="path to file containing haplotypes and alleles " + "The file should contain 5 columns:\n" + " position RefAllele AltAllele hap1 hap2\n" + " example: 16050984 C G 1 0") + + + return parser.parse_args() + + + + + +def read_haps(hap_file): + if hap_file.endswith(".gz"): + f = gzip.open(hap_file) + else: + f = open(hap_file) + + pos_list = [] + ref_allele_list = [] + alt_allele_list = [] + hap1_list = [] + hap2_list = [] + + for line in f: + words = line.rstrip().split() + ref_allele = words[1].upper() + alt_allele = words[2].upper() + + # ignore indels for now + if ref_allele not in ("A", "C", "T", "G"): + continue + if alt_allele not in ("A", "C", "T", "G"): + continue + + pos_list.append(int(words[0])) + + # append ascii code instead of char, as this is how sequence + # is represented in HDF5 files + ref_allele_list.append(ord(ref_allele)) + alt_allele_list.append(ord(alt_allele)) + + hap1_list.append(int(words[3])) + hap2_list.append(int(words[4])) + + pos_array = np.array(pos_list, dtype=np.int32) + ref_allele_array = np.array(ref_allele_list, dtype=np.uint8) + alt_allele_array = np.array(alt_allele_list, dtype=np.uint8) + hap1_array = np.array(hap1_list, dtype=np.uint8) + hap2_array = np.array(hap2_list, dtype=np.uint8) + + haplotypes = Haplotypes(pos_array, ref_allele_array, alt_allele_array, + hap1_array, hap2_array) + + f.close() + + return haplotypes + + + + +def write_reads(file1, file2, read_coord, left_seq, right_seq): + """Outputs PE reads to two separate files in fastq format""" + + if len(right_seq) != len(left_seq): + raise ValueError("length of right and left seqs does not match") + + read_len = len(left_seq) + + qual_str = "h" * read_len + + tile = np.random.randint(32767) + + # use chromosome number as lane number + # random identifier as tile + # start, end of fragment as x, y pixels + base_id = "PE%d:%s:%d:%d:%d#0" % (read_len, read_coord.chrom_name, + tile, read_coord.left_start, + read_coord.right_end) + + # id1 = base_id + "/1" + # id2 = base_id + "/2" + id1 = id2 = base_id + + # Make left read "read1" 50 % of time + if np.random.randint(2) == 0: + # make left read read1 + file1.write("@%s\n%s\n+\n%s\n" % (id1, left_seq, qual_str)) + file2.write("@%s\n%s\n+\n%s\n" % 
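To make the `--hap_file` format concrete, here is a hedged sketch (Python 2, like the rest of the mapping scripts) that writes a tiny 5-column haplotype file of the kind `read_haps()` expects. The first row is the example from the option help; the other positions and alleles are made up:

    import gzip

    rows = ["16050984 C G 1 0\n",   # position ref alt hap1 hap2
            "16051347 T A 0 1\n",   # made-up row
            "16052513 G C 1 1\n"]   # made-up row (homozygous alt)
    f = gzip.open("chr22.haps.txt.gz", "wb")
    for row in rows:
        f.write(row)
    f.close()
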
(id2, right_seq, qual_str)) + else: + file1.write("@%s\n%s\n+\n%s\n" % (id1, right_seq, qual_str)) + file2.write("@%s\n%s\n+\n%s\n" % (id2, left_seq, qual_str)) + + + + + +def gen_read_coords(options, haps, het_only=True): + """generate coordinates for a read pair that overlaps a SNP""" + # to make this more efficient, could observer random + # vars for many reads at once, rather than single read at a time + + if het_only: + # select a heterozygous site to overlap + is_het = haps.hap1 != haps.hap2 + i = numpy.random.randint(np.sum(is_het)) + snp_pos = haps.pos[is_het][i] + + + sys.stderr.write("selected SNP %d %s/%s\n" % + (snp_pos, chr(haps.ref_allele[i]), chr(haps.alt_allele[i]))) + + + else: + # select any SNP to overlap + i = numpy.random.randint(haps.pos.shape[0]) + snp_pos = haps.pos[i] + + + # at what read position should het site be? + snp_read_pos = numpy.random.randint(options.read_len) + + # what should insert size be? + insert_size = numpy.random.normal(options.insert_size_mean, + options.insert_size_sd) + insert_size = int(np.rint(insert_size)) + + # insert size cannot be smaller than read size... + insert_size = max(options.read_len, insert_size) + + # does left or right read overlap SNP? + if numpy.random.randint(2) == 0: + # left read overlaps SNP + left_start = snp_pos - snp_read_pos + 1 + left_end = left_start + options.read_len - 1 + right_end = left_start + insert_size - 1 + right_start = right_end - options.read_len + 1 + else: + # right read overlaps SNP + right_start = snp_pos - snp_read_pos + 1 + right_end = right_start + options.read_len - 1 + left_start = right_end - insert_size + 1 + left_end = left_start + options.read_len - 1 + + read_coord = ReadCoord(options.chrom, left_start, left_end, right_start, right_end) + + return read_coord + + + +def gen_seqs(read_coord, hap1_seq, hap2_seq): + """makes the sequence strings for each read, + choosing 1 haplotype at random to obtain sequence from""" + # randomly select haplotype + if np.random.randint(2) == 0: + chrom_seq = hap1_seq + else: + chrom_seq = hap2_seq + + # sys.stderr.write("%s-%s %s-%s\n" % (str(read_coord.left_start), + # str(read_coord.left_end), + # str(read_coord.right_start), + # str(read_coord.right_end))) + + s = read_coord.left_start - 1 + e = read_coord.left_end + left_read_seq = chrom_seq[s:e] + # sys.stderr.write("%s\n%s\n" % (hap1_seq[s:e], hap2_seq[s:e])) + + s = read_coord.right_start - 1 + e = read_coord.right_end + right_read_seq = revcomp(chrom_seq[s:e]) + # sys.stderr.write("%s\n%s\n\n" % (hap1_seq[s:e], hap2_seq[s:e])) + + return right_read_seq, left_read_seq + + + + + +def make_hap_seqs(haps, options): + """Makes a chromosome sequence for each haplotype""" + seq_h5 = tables.openFile(options.seq) + + node_name = "/%s" % options.chrom + if node_name not in seq_h5: + raise ValueError("chromosome %s is not in sequence h5 file" % options.chrom) + + seq_node = seq_h5.getNode("/%s" % options.chrom) + + seq_array1 = seq_node[:] + seq_array2 = np.array(seq_node[:]) + + is_alt = (haps.hap1 == 1) + seq_array1[haps.pos[is_alt] - 1] = haps.alt_allele[is_alt] + + is_alt = (haps.hap2 == 1) + seq_array2[haps.pos[is_alt] - 1] = haps.alt_allele[is_alt] + + + seq1 = "".join([chr(x) for x in seq_array1]) + seq2 = "".join([chr(x) for x in seq_array2]) + + seq_h5.close() + + return seq1, seq2 + + + + + +def main(): + options = parse_options() + + sys.stderr.write("reading haplotype information\n") + haplotypes = read_haps(options.hap_file) + + sys.stderr.write("making haplotype sequences\n") + hap1_seq, 
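As a quick check of the coordinate arithmetic in `gen_read_coords()` above: with a read length of 36, an insert size of 100, and the SNP landing on the 10th base of the left read, the fragment coordinates work out as below. This is a hedged, self-contained recomputation of the formulas shown above, not WASP code:

    read_len, insert_size = 36, 100
    snp_pos, snp_read_pos = 16050984, 10

    left_start = snp_pos - snp_read_pos + 1      # 16050975
    left_end = left_start + read_len - 1         # 16051010
    right_end = left_start + insert_size - 1     # 16051074
    right_start = right_end - read_len + 1       # 16051039

    assert left_end - left_start + 1 == read_len
    assert right_end - right_start + 1 == read_len
    assert right_end - left_start + 1 == insert_size
    assert left_start <= snp_pos <= left_end     # left read covers the SNP
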
hap2_seq = make_hap_seqs(haplotypes, options) + + if options.out_fastq1.endswith(".gz"): + fastq1_file = gzip.open(options.out_fastq1, "w") + else: + fastq1_file = open(options.out_fastq1, "w") + + if options.out_fastq2.endswith(".gz"): + fastq2_file = gzip.open(options.out_fastq2, "w") + else: + fastq2_file = open(options.out_fastq2, "w") + + + sys.stderr.write("simulating reads\n") + for i in range(options.n_reads): + # generate coords for a read pair, where one read pair overlaps a SNP + read_coord = gen_read_coords(options, haplotypes) + + # make read pair sequences + left_seq, right_seq = gen_seqs(read_coord, hap1_seq, hap2_seq) + + write_reads(fastq1_file, fastq2_file, read_coord, left_seq, right_seq) + + fastq1_file.close() + fastq2_file.close() + + + + + +if __name__ == "__main__": + main() diff --git a/mapping/snake_conf.yaml b/mapping/snake_conf.yaml new file mode 100644 index 0000000..e4f1e91 --- /dev/null +++ b/mapping/snake_conf.yaml @@ -0,0 +1,45 @@ +# +# This is the Snakemake configuration file that specifies paths and +# and options for the mapping pipeline +# + +{ + # py2 is a hack so that python2 scripts can be called + # by snakemake (which is written in python3). The value should + # be a command to do whatever is needed to setup a python2 environment and + # call the python2 interpreter. Depending on your system configuration, + # this might involve setting the PATH environment variable or + # just calling python2 + "py2" : "PATH=$HOME/anaconda2/bin:$PATH; python ", + + # WASP directory containing scripts + "wasp_dir" : "/iblm/netapp/home/gmcvicker/proj/WASP", + + # directory to write all output data files to + "output_dir" : "/iblm/netapp/home/gmcvicker/data1/external/Grubert2015/WASP", + + # location of chromInfo file containing chromosome names and lengths + # (can be downloaded from UCSC genome browser) + "chrom_info" : "/iblm/netapp/home/gmcvicker/proj/WASP/examples/example_data/chromInfo.hg19.txt", + + # directory to read VCF files from (used to write SNP input files) + "vcf_dir" : "/iblm/netapp/home/gmcvicker/data1/external/1000Genomes/GRC37/20130502", + + # OLD way: directory to write SNP plain text input files to + "snp_dir" : "/iblm/netapp/home/gmcvicker/proj/WASP/examples/example_data/genotypes/snps", + + # directory to write SNP HDF5 input files to + snp_h5_dir : "/iblm/netapp/home/gmcvicker/proj/WASP/examples/example_data/genotypes/snp_h5", + + # path to samtools + "samtools" : "/iblm/netapp/home/gmcvicker/anaconda2/bin/samtools", + + # path to bowtie2 aligner + "bowtie2" : "/iblm/netapp/home/gmcvicker/anaconda2/bin/bowtie2", + + # prefix for bowtie2 reference genome index + "bowtie2_index" : "/iblm/netapp/data1/external/GRC37/combined/bowtie2_index/hg37", + + # sample file containing sample names and paths to fastq files + "sample_file" : "/iblm/netapp/data1/external/Grubert2015/sample_paths_H3K27ac.1000G_only.txt" +} diff --git a/mapping/snptable.py b/mapping/snptable.py new file mode 100644 index 0000000..b1fdfd8 --- /dev/null +++ b/mapping/snptable.py @@ -0,0 +1,412 @@ +import sys +import numpy as np +import gzip +import pysam +import operator + +NUCLEOTIDES = set(['A', 'C', 'T', 'G']) +SNP_UNDEF = -1 + + +# codes for CIGAR string +BAM_CMATCH = 0 # M - match/mismatch to ref M +BAM_CINS = 1 # I - insertion in read relative to ref +BAM_CDEL = 2 # D - deletion in read relative to ref +BAM_CREF_SKIP = 3 # N - skipped region from reference (e.g. 
intron) +BAM_CSOFT_CLIP = 4 # S - soft clipping (clipped sequence present in seq) +BAM_CHARD_CLIP = 5 # H - hard clipping (clipped sequence NOT present in seq) +BAM_CPAD = 6 # P - padding (silent deletion from padded reference) +BAM_CEQUAL = 7 # = - sequence match +BAM_CDIFF = 8 # X - sequence mismatch + +class SNPTable(object): + def __init__(self): + self.clear() + + def clear(self): + # snp_index and indel_index are arrays of length + # max(snp_pos, indel_pos) that provide lookup + # into snp_pos, snp_allele1, etc. by chromosome position. + # For example, if the first and second snps on the chromosome are + # at positions 1234, 1455 then elements 1233 and 1444 of the + # snp_index array will be 0 and 1 (and can be used to lookup + # info for the SNP in snp_pos, snp_allele1, snp_allele2 arrays) + self.snp_index = np.array([], dtype=np.int32) + self.snp_pos = np.array([], dtype=np.int32) + self.snp_allele1 = np.array([], dtype="|S10") + self.snp_allele2 = np.array([], dtype="|S10") + self.haplotypes = None + self.n_snp = 0 + self.samples = [] + + + + def read_h5(self, snp_tab_h5, snp_index_h5, hap_h5, chrom_name, + samples=None): + """read in SNPs and indels from HDF5 input files""" + + node_name = "/%s" % chrom_name + + if node_name not in snp_tab_h5: + sys.stderr.write("WARNING: chromosome %s is not " + "in snp_tab.h5 file, assuming no SNPs " + "for this chromosome\n" % chrom_name) + self.clear() + return + + else: + # get numpy array of SNP idices + node = snp_index_h5.getNode(node_name) + self.snp_index = node[:] + + # get numpy array of SNP positions + node = snp_tab_h5.getNode(node_name) + self.snp_pos = node[:]['pos'] + self.snp_allele1 = node[:]['allele1'] + self.snp_allele2 = node[:]['allele2'] + self.n_snp = self.snp_pos.shape[0] + self.samples = self.get_h5_samples(hap_h5, chrom_name) + self.haplotypes = hap_h5.getNode(node_name) + + if samples: + # reduce set of SNPs and indels to ones that are + # polymorphic in provided list of samples + samp_idx_dict, samp_idx = self.get_h5_sample_indices(hap_h5, chrom_name, samples) + + hap_idx = np.empty(samp_idx.shape[0]*2, dtype=np.int) + hap_idx[0::2] = samp_idx*2 + hap_idx[1::2] = samp_idx*2 + 1 + haps = self.haplotypes[:,hap_idx] + + # count number of ref and non-ref alleles, + # ignoring undefined (-1s) + nonref_count = np.apply_along_axis(np.sum, 1, haps == 1) + ref_count = np.apply_along_axis(np.sum, 1, haps == 0) + total_count = nonref_count + ref_count + is_polymorphic = (ref_count > 0) & (ref_count < total_count) + + # reduce to set of polymorphic positions + sys.stderr.write("reducing %d SNPs on chromosome " + "%s to %d positions that are polymorphic in " + "sample of %d individuals\n" % + (haps.shape[0], chrom_name, + np.sum(is_polymorphic), len(samples))) + + # make filtered and ordered samples for this chromosome + # that corresponds to order of haplotypes + sorted_samps = sorted(samp_idx_dict.items(), + key=operator.itemgetter(1)) + self.samples = [x[0] for x in sorted_samps] + + self.haplotypes = haps[is_polymorphic,] + self.snp_pos = self.snp_pos[is_polymorphic] + self.snp_allele1 = self.snp_allele1[is_polymorphic] + self.snp_allele2 = self.snp_allele2[is_polymorphic] + self.n_snp = self.snp_pos.shape[0] + + # regenerate index to point to reduced set of polymorphic SNPs + self.snp_index[:] = -1 + self.snp_index[self.snp_pos-1] = np.arange(self.n_snp, + dtype=np.int32) + + + + def get_h5_samples(self, h5f, chrom_name): + """Reads list of samples that are present in 'samples' table + from haplotype HDF5 file""" + samples = 
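The `snp_index` array described above is the central lookup structure: element `pos - 1` holds the row of the SNP at 1-based position `pos` in the `snp_pos`/`snp_allele1`/`snp_allele2` arrays, or -1 (`SNP_UNDEF`) when there is no SNP at that position. A small, hedged sketch using the two example positions from the docstring:

    import numpy as np

    snp_pos = np.array([1234, 1455], dtype=np.int32)    # 1-based positions
    snp_index = np.empty(snp_pos.max(), dtype=np.int32)
    snp_index[:] = -1                                    # SNP_UNDEF
    snp_index[snp_pos - 1] = np.arange(snp_pos.shape[0], dtype=np.int32)

    assert snp_index[1234 - 1] == 0    # row of the SNP at position 1234
    assert snp_index[1455 - 1] == 1    # row of the SNP at position 1455
    assert snp_index[1000 - 1] == -1   # no SNP at position 1000
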
None + + node_name = "/samples_%s" % chrom_name + + if node_name in h5f: + node = h5f.getNode(node_name) + samples = [row["name"] for row in node] + else: + raise ValueError("Cannot retrieve haplotypes for " + "specified samples, because haplotype " + "file %s does not contain '%s' table. " + "May need to regenerate haplotype HDF5 file " + "using snp2h5" % (h5f.filename, node_name)) + return samples + + + + def get_h5_sample_indices(self, hap_h5, chrom_name, samples): + """returns the indices of the the specified samples in the + HDF5 haplotype file. Indices are returned in a dictionary + keyed on sample and as an array. Samples that are not + found in the haplotype HDF5 file for the specified chromosome + are not included in the dict or the array.""" + hap_samples = self.get_h5_samples(hap_h5, chrom_name) + not_seen_samples = set(samples) + seen_samples = set([]) + samp_idx = [] + samp_idx_dict = {} + + # get haplotype table indices of samples + for i in range(len(hap_samples)): + if hap_samples[i] in seen_samples: + sys.stderr.write("WARNING: sample %s is present multiple " + "times in haplotype table\n" % hap_samples[i]) + elif hap_samples[i] in not_seen_samples: + # record index of this sample, add to set of samples + # we have already observed + samp_idx.append(i) + samp_idx_dict[hap_samples[i]] = i + not_seen_samples.remove(hap_samples[i]) + seen_samples.add(hap_samples[i]) + else: + # this haplotype sample not in requested list + pass + + if len(not_seen_samples) > 0: + sys.stderr.write("WARNING: the following samples are not " + "present in haplotype table for chromosome " + "%s: %s" % + (chrom_name, ",".join(not_seen_samples))) + + return samp_idx_dict, np.array(samp_idx, dtype=np.int) + + + + def is_snp(self, allele1, allele2): + """returns True if alleles appear to be + single-nucleotide polymorphism, returns false + if appears to be an indel""" + + if (len(allele1) == 1) and (len(allele2) == 1): + if allele1 in NUCLEOTIDES and allele2 in NUCLEOTIDES: + # this is a SNP + return True + else: + if ("-" in allele1) or ("-" in allele2): + # 1bp indel + return False + else: + sys.stderr.write("WARNING: unexpected character " + "in SNP alleles:\n%s/%s\n" % + (allele1, allele2)) + return False + + return False + + + + + def read_file(self, filename): + """read in SNPs and indels from text input file""" + try: + if filename.endswith(".gz"): + f = gzip.open(filename) + else: + f = open(filename, "r") + except IOError: + sys.stderr.write("WARNING: unable to read from file '%s', " + "assuming no SNPs for this chromosome\n" % + filename) + self.clear() + return + + snp_pos_list = [] + snp_allele1_list = [] + snp_allele2_list = [] + max_pos = 0 + + for line in f: + words = line.split() + + if(len(words) < 3): + raise ValueError("expected at least 3 values per SNP " + "file line but got %d:\n" + "%s\n" % (len(words), line)) + + pos = int(words[0]) + a1 = words[1].upper().replace("-", "") + a2 = words[2].upper().replace("-", "") + + if pos <= 0: + raise ValueError("expected SNP position to be >= 1:\n%s\n" % + line) + + if pos > max_pos: + max_pos = pos + + snp_pos_list.append(pos) + snp_allele1_list.append(a1) + snp_allele2_list.append(a2) + + f.close() + + # convert lists to numpy arrays, which allow for faster + # lookups and use less memory + self.snp_pos = np.array(snp_pos_list, dtype=np.int32) + del snp_pos_list + self.snp_allele1 = np.array(snp_allele1_list, dtype="|S10") + del snp_allele1_list + self.snp_allele2 = np.array(snp_allele2_list, dtype="|S10") + del snp_allele2_list + + # 
make another array that makes it easy to lookup SNPs by their position + # on the chromosome + self.snp_index = np.empty(max_pos, dtype=np.int32) + self.snp_index[:] = SNP_UNDEF + self.snp_index[self.snp_pos-1] = np.arange(self.snp_pos.shape[0]) + + self.n_snp = self.snp_pos.shape[0] + + # currently haplotypes can only be read from HDF5 file + self.haplotypes = None + + + def get_overlapping_snps(self, read): + """Returns several lists: + [1] indices of SNPs that this read overlaps, + [2] positions in read sequence that overlap SNPs, + [3] indices for indels that read overlaps, + [4] positions in read sequence that overlap indels. + First base of read is position 1.""" + + # read.cigar is a list of tuples. Each tuple has two entries. The first + # entry specifies the character in the cigar and the second entry + # specifies the length of that character. The values are + # M BAM_CMATCH 0 + # I BAM_CINS 1 + # D BAM_CDEL 2 + # N BAM_CREF_SKIP 3 + # S BAM_CSOFT_CLIP 4 + # H BAM_CHARD_CLIP 5 + # P BAM_CPAD 6 + # = BAM_CEQUAL 7 + # X BAM_CDIFF 8 + # E.g. (0, 5) means 5 matches, and (4, 2) means a soft clip of 2bp + read_start = 0 + read_end = 0 + genome_start = read.pos + genome_end = read.pos + + # index into combined SNP/indel table for overlapping SNPs + snp_idx = [] + # positions in read of overlapping SNPs + snp_read_pos = [] + # index into combined SNP/indel table for overlapping indels + indel_idx = [] + # positions in read of overlapping SNPs + indel_read_pos = [] + + for cigar in read.cigar: + op = cigar[0] # CIGAR 'operation' + op_len = cigar[1] # length of operation + + if (op == BAM_CMATCH) or (op == BAM_CEQUAL) or (op == BAM_CDIFF): + # match or mismatch to reference + read_start = read_end + 1 + read_end = read_start + op_len - 1 + genome_start = genome_end + 1 + genome_end = genome_start + op_len - 1 + + # check for SNP in this genome segment + s = genome_start - 1 + e = min(genome_end, self.snp_index.shape[0]) + s_idx = self.snp_index[s:e] + offsets = np.where(s_idx != SNP_UNDEF)[0] + + if offsets.shape[0] > 0: + # there are overlapping SNPs and/or indels + + for offset in offsets: + read_pos = offset + read_start + + allele1 = self.snp_allele1[s_idx[offset]] + allele2 = self.snp_allele2[s_idx[offset]] + if self.is_snp(allele1, allele2): + snp_idx.append(s_idx[offset]) + snp_read_pos.append(read_pos) + else: + indel_idx.append(s_idx[offset]) + indel_read_pos.append(read_pos) + + elif op == BAM_CINS: + # insert in read relative to reference + read_start = read_end + 1 + read_end = read_start + op_len - 1 + + # Genome sequence does not advance, no possibility + # for read to overlap SNP, since these bases do + # not exist in reference. 
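For context, a hedged sketch of how a caller consumes `get_overlapping_snps()`; this mirrors the counting loop used in `get_as_counts.py`. The file names are placeholders, and the SNP file must belong to the same chromosome as the reads:

    import pysam

    snp_tab = SNPTable()
    snp_tab.read_file("chr22.snps.txt.gz")

    ref_count = alt_count = 0
    bam = pysam.Samfile("reads.chr22.sort.bam")
    for read in bam:
        snp_idx, snp_read_pos, indel_idx, indel_read_pos = \
            snp_tab.get_overlapping_snps(read)
        for snp_i, read_pos in zip(snp_idx, snp_read_pos):
            base = read.query[read_pos - 1]      # read positions are 1-based
            if base == snp_tab.snp_allele1[snp_i]:
                ref_count += 1
            elif base == snp_tab.snp_allele2[snp_i]:
                alt_count += 1
    bam.close()
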
+ # INDELs here should be picked up + # by one of flanking match segments + + elif op == BAM_CDEL: + # deletion in read relative to reference + genome_start = genome_end + 1 + genome_end = genome_start + op_len - 1 + + # Read sequence does not advance, no possibility + # for read to overlap SNP, since these bases do + # not exist in read + + # in most cases deletion should be picked up + # by flanking match segment, but there could be + # nested indels + + s = genome_start - 1 + e = min(genome_end, self.snp_index.shape[0]) + + # check for INDEL in this genome segment + s_idx = self.snp_index[s:e] + offsets = np.where(s_idx != SNP_UNDEF)[0] + + if offsets.shape[0] > 0: + # there are overlapping SNPs and/or indels + for offset in offsets: + read_pos = offset + read_start + allele1 = self.snp_allele1[s_idx[offset]] + allele2 = self.snp_allele2[s_idx[offset]] + if self.is_snp(allele1, allele2): + # ignore SNP + pass + else: + indel_idx.append(s_idx[offset]) + # position in read is where we last left off + # in read sequence + indel_read_pos.append(read_end) + elif op == BAM_CREF_SKIP: + # section of skipped reference, such as intron + genome_end = genome_end + op_len + genome_start = genome_end + + # do nothing with SNPs/indels in this region + # since they are skipped + + elif op == BAM_CSOFT_CLIP: + # this part of read skipped + read_start = read_end + 1 + read_end = read_start + op_len - 1 + + # This is like insert, but at the beginning of the read. + # TODO: handle indels? Sometimes a read can be softclipped + # because it contains insert relative to reference, but + # in these cases, presumably reference version of read + # would map to same location (with higher score). + + elif seq_type == BAM_CHARD_CLIP: + # these bases not included in read or genome + pass + + elif seq_type == BAM_CPAD: + # like an insert, likely only used in multiple-sequence + # alignment where inserts may be of different lengths + # in different seqs + read_start += read_end + 1 + read_end = read_start + op_len - 1 + + else: + raise ValueError("unknown CIGAR code %d" % op) + + if read_end != len(read.seq): + raise ValueError("length of read segments in CIGAR %d " + "does not add up to query length (%d)" % + (read_end, len(read.seq))) + + + return snp_idx, snp_read_pos, indel_idx, indel_read_pos diff --git a/mapping/test_data/bad_first_paired/bad_first.to.remap.num.gz b/mapping/test_data/bad_first_paired/bad_first.to.remap.num.gz deleted file mode 100644 index 9c8cab5..0000000 Binary files a/mapping/test_data/bad_first_paired/bad_first.to.remap.num.gz and /dev/null differ diff --git a/mapping/test_data/bad_first_paired/bad_first_remapped.bam b/mapping/test_data/bad_first_paired/bad_first_remapped.bam deleted file mode 100644 index 92e2578..0000000 Binary files a/mapping/test_data/bad_first_paired/bad_first_remapped.bam and /dev/null differ diff --git a/mapping/test_data/bad_first_paired/bad_first_to_remap.bam b/mapping/test_data/bad_first_paired/bad_first_to_remap.bam deleted file mode 100644 index c813543..0000000 Binary files a/mapping/test_data/bad_first_paired/bad_first_to_remap.bam and /dev/null differ diff --git a/mapping/test_data/bad_last_paired/bad_last.to.remap.num.gz b/mapping/test_data/bad_last_paired/bad_last.to.remap.num.gz deleted file mode 100644 index 5b39745..0000000 Binary files a/mapping/test_data/bad_last_paired/bad_last.to.remap.num.gz and /dev/null differ diff --git a/mapping/test_data/bad_last_paired/bad_last_remapped.bam b/mapping/test_data/bad_last_paired/bad_last_remapped.bam deleted file 
mode 100644 index 70e36c4..0000000 Binary files a/mapping/test_data/bad_last_paired/bad_last_remapped.bam and /dev/null differ diff --git a/mapping/test_data/bad_last_paired/bad_last_to_remap.bam b/mapping/test_data/bad_last_paired/bad_last_to_remap.bam deleted file mode 100644 index 9ef5588..0000000 Binary files a/mapping/test_data/bad_last_paired/bad_last_to_remap.bam and /dev/null differ diff --git a/mapping/test_data/bad_middle_paired/bad_middle.to.remap.num.gz b/mapping/test_data/bad_middle_paired/bad_middle.to.remap.num.gz deleted file mode 100644 index 807ecca..0000000 Binary files a/mapping/test_data/bad_middle_paired/bad_middle.to.remap.num.gz and /dev/null differ diff --git a/mapping/test_data/bad_middle_paired/bad_middle_remapped.bam b/mapping/test_data/bad_middle_paired/bad_middle_remapped.bam deleted file mode 100644 index 15564d5..0000000 Binary files a/mapping/test_data/bad_middle_paired/bad_middle_remapped.bam and /dev/null differ diff --git a/mapping/test_data/bad_middle_paired/bad_middle_to_remap.bam b/mapping/test_data/bad_middle_paired/bad_middle_to_remap.bam deleted file mode 100644 index 1aaf98e..0000000 Binary files a/mapping/test_data/bad_middle_paired/bad_middle_to_remap.bam and /dev/null differ diff --git a/mapping/test_data/issue_18.bam b/mapping/test_data/issue_18.bam deleted file mode 100644 index cb441c3..0000000 Binary files a/mapping/test_data/issue_18.bam and /dev/null differ diff --git a/mapping/test_data/issue_18_snps/chr7.snps.txt.gz b/mapping/test_data/issue_18_snps/chr7.snps.txt.gz deleted file mode 100644 index 3c17786..0000000 Binary files a/mapping/test_data/issue_18_snps/chr7.snps.txt.gz and /dev/null differ diff --git a/mapping/test_data/snps/chr1.snps.txt.gz b/mapping/test_data/snps/chr1.snps.txt.gz deleted file mode 100644 index 063a9b0..0000000 Binary files a/mapping/test_data/snps/chr1.snps.txt.gz and /dev/null differ diff --git a/mapping/test_data/test_issue_18.remapped.bam b/mapping/test_data/test_issue_18.remapped.bam deleted file mode 100644 index 808947f..0000000 Binary files a/mapping/test_data/test_issue_18.remapped.bam and /dev/null differ diff --git a/mapping/test_data/test_issue_23.remapped.bam b/mapping/test_data/test_issue_23.remapped.bam deleted file mode 100644 index deda549..0000000 Binary files a/mapping/test_data/test_issue_23.remapped.bam and /dev/null differ diff --git a/mapping/test_data/test_issue_23.sort.bam b/mapping/test_data/test_issue_23.sort.bam deleted file mode 100644 index cb441c3..0000000 Binary files a/mapping/test_data/test_issue_23.sort.bam and /dev/null differ diff --git a/mapping/test_data/test_paired.remapped.bam b/mapping/test_data/test_paired.remapped.bam deleted file mode 100644 index 808947f..0000000 Binary files a/mapping/test_data/test_paired.remapped.bam and /dev/null differ diff --git a/mapping/test_data/test_paired.sort.bam b/mapping/test_data/test_paired.sort.bam deleted file mode 100644 index f6eb4e7..0000000 Binary files a/mapping/test_data/test_paired.sort.bam and /dev/null differ diff --git a/mapping/test_data/test_paired_reverse.remapped.bam b/mapping/test_data/test_paired_reverse.remapped.bam deleted file mode 100644 index cfd0394..0000000 Binary files a/mapping/test_data/test_paired_reverse.remapped.bam and /dev/null differ diff --git a/mapping/test_data/test_paired_reverse.sort.bam b/mapping/test_data/test_paired_reverse.sort.bam deleted file mode 100644 index 6adb45c..0000000 Binary files a/mapping/test_data/test_paired_reverse.sort.bam and /dev/null differ diff --git 
a/mapping/test_data/test_paired_unmapped.remapped.bam b/mapping/test_data/test_paired_unmapped.remapped.bam deleted file mode 100644 index 6e67e02..0000000 Binary files a/mapping/test_data/test_paired_unmapped.remapped.bam and /dev/null differ diff --git a/mapping/test_data/test_single.remapped.bam b/mapping/test_data/test_single.remapped.bam deleted file mode 100644 index 8395f19..0000000 Binary files a/mapping/test_data/test_single.remapped.bam and /dev/null differ diff --git a/mapping/test_data/test_single.sort.bam b/mapping/test_data/test_single.sort.bam deleted file mode 100644 index 4d6105e..0000000 Binary files a/mapping/test_data/test_single.sort.bam and /dev/null differ diff --git a/mapping/test_data/test_single_reverse.remapped.bam b/mapping/test_data/test_single_reverse.remapped.bam deleted file mode 100644 index d5e9d54..0000000 Binary files a/mapping/test_data/test_single_reverse.remapped.bam and /dev/null differ diff --git a/mapping/test_data/test_single_reverse.sort.bam b/mapping/test_data/test_single_reverse.sort.bam deleted file mode 100644 index 997c13d..0000000 Binary files a/mapping/test_data/test_single_reverse.sort.bam and /dev/null differ diff --git a/mapping/test_data/test_single_unmapped.remapped.bam b/mapping/test_data/test_single_unmapped.remapped.bam deleted file mode 100644 index 25b7c8f..0000000 Binary files a/mapping/test_data/test_single_unmapped.remapped.bam and /dev/null differ diff --git a/mapping/test_data/test_two_snps_paired.remapped.bam b/mapping/test_data/test_two_snps_paired.remapped.bam deleted file mode 100644 index 808947f..0000000 Binary files a/mapping/test_data/test_two_snps_paired.remapped.bam and /dev/null differ diff --git a/mapping/test_data/test_two_snps_single.remapped.bam b/mapping/test_data/test_two_snps_single.remapped.bam deleted file mode 100644 index 8ef180a..0000000 Binary files a/mapping/test_data/test_two_snps_single.remapped.bam and /dev/null differ diff --git a/mapping/test_data/two_snps/chr1.snps.txt.gz b/mapping/test_data/two_snps/chr1.snps.txt.gz deleted file mode 100644 index 12122d6..0000000 Binary files a/mapping/test_data/two_snps/chr1.snps.txt.gz and /dev/null differ diff --git a/mapping/test_filter_remapped_reads.py b/mapping/test_filter_remapped_reads.py index ba799f8..5fbdcb7 100644 --- a/mapping/test_filter_remapped_reads.py +++ b/mapping/test_filter_remapped_reads.py @@ -1,354 +1,230 @@ -import glob -import gzip +import sys import os import subprocess -from find_intersecting_snps import * -from filter_remapped_reads import * +import filter_remapped_reads +import util +# +# filter_remapped_reads.py +# INPUT FILES: +# to_remap_bam - input BAM file containing original set of reads +# that need to be remapped after having their alleles flipped +# +# remap_bam - input BAM file containing remapped reads. Read names in this +# file should be delimited with the '.' character and +# contain the following fields: +# ... +# +# For single-end reads is the left end of the read +# (e.g. 16052611) +# For paired-end reads the coordinate is the start of the +# the left read and start of the right read: +# (e.g. 
16052611-16052734) +# +# +# +# OUTPUT FILES: +# keep_bam - ouput BAM file containing reads that are retained +# after filtering +# +# + + +# +# TODO: need to verify that interleaved read pairs handled appropriately +# TODO: need to test single end reads +# +# + + +def write_sam_header(f): + f.write("@HD VN:1.0 SO:coordinate\n") + f.write("@SQ SN:chr22 LN:51304566\n") + f.write('@PG ID:bowtie2 PN:bowtie2 VN:2.2.6 CL:"/iblm/netapp/home/gmcvicker/anaconda2/bin/bowtie2-align-s --wrapper basic-0 -x /iblm/netapp/data1/external/GRC37/combined/bowtie2_index/hg37 -1 /tmp/16686.inpipe1 -2 /tmp/16686.inpipe2\n') + + + + +def write_to_remap_bam_pe(data_dir="test_data", bam_filename="test_data/test.to.remap.bam"): + sam_lines = ["SRR1658224.34085432 163 chr22 16052611 12 101M = 16052734 224 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + "SRR1658224.34085432 83 chr22 16052734 12 101M = 16052611 -224 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP", + "SRR1658224.34975561 99 chr22 16071944 12 101M = 16072163 320 ATTTATTTATTTATTTATTATTGGGACAGAGTCTCACTCTGTCCCCCAGACTGGAGTCCAGTGACATGATCTCAGCTCACTGCAACCTCTGCCTCGTGGGT CCCFFFFFHHHHHJJJJJJJJJJJJIJJJJIEHIJJJJJJJIIJJJJJIJJJJJJJJJJIJHIJIJJJJIJJJJJHHHHHHFFFFFECEEEEDDDDDDBBD AS:i:-5 XS:i:-22 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:89C11 YS:i:0 YT:Z:CP", + "SRR1658224.34975561 147 chr22 16072163 12 101M = 16071944 -320 GTCTCAAACTTCTGACCTCAGGTGATCCACCCACCTCGACCTCCCAAAGTGCTGGGATTACAGGCACTAGGTCCCTAAATTAGAGCCATATTCTTTAATGT DDBCDEDCDCCDCC?DDDDDDDBACBDA %s" % (sam_filename, bam_filename), shell=True) + + + +def write_remap_bam_pe(data_dir="test_data", bam_filename="test_data/test.remap.bam"): + sam_lines = [ + # Read pair expected to map 2 times and maps to correct location 2 times + "SRR1658224.34085432.16052611-16052734.1.2 163 chr22 16052611 12 101M = 16052734 224 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + "SRR1658224.34085432.16052611-16052734.1.2 83 chr22 16052734 12 101M = 16052611 -224 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP", + "SRR1658224.34085432.16052611-16052734.2.2 163 chr22 16052611 12 101M = 16052734 224 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + "SRR1658224.34085432.16052611-16052734.2.2 83 chr22 16052734 12 101M = 16052611 -224 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT 
DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP", + + + # Read pair expected to map 2 times, but only maps 1 time + "SRR1658224.34975561.16071944-16072163.2.2 99 chr22 16071944 12 101M = 16072163 320 ATTTATTTATTTATTTATTATTGGGACAGAGTCTCACTCTGTCCCCCAGACTGGAGTCCAGTGACATGATCTCAGCTCACTGCAACCTCTGCCTCGTGGGT CCCFFFFFHHHHHJJJJJJJJJJJJIJJJJIEHIJJJJJJJIIJJJJJIJJJJJJJJJJIJHIJIJJJJIJJJJJHHHHHHFFFFFECEEEEDDDDDDBBD AS:i:-5 XS:i:-22 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:89C11 YS:i:0 YT:Z:CP", + "SRR1658224.34975561.16071944-16072163.2.2 147 chr22 16072163 12 101M = 16071944 -320 GTCTCAAACTTCTGACCTCAGGTGATCCACCCACCTCGACCTCCCAAAGTGCTGGGATTACAGGCACTAGGTCCCTAAATTAGAGCCATATTCTTTAATGT DDBCDEDCDCCDCC?DDDDDDDBACBDA %s" % (sam_filename, tmp_bam_filename), shell=True) + # sort the temp bam file + util.sort_bam(tmp_bam_filename, data_dir + "/tmp") + # remove temp bam + os.remove(tmp_bam_filename) + # rename sorted bam to output bam filename + os.rename(data_dir + "/tmp.sort.bam", bam_filename) + def read_bam(bam): """ Read a bam file into a list where each element of the list is a line from the bam file (with the newline stripped). The header is discarded. """ - res = subprocess.check_output('samtools view {}'.format(bam), shell=True) + res = subprocess.check_output('samtools view %s' % bam, shell=True) return res.strip().split('\n') -def cleanup(): - fns = (glob.glob('test_data/test*.keep.bam') + - glob.glob('test_data/test*.remap.fq*.gz') + - glob.glob('test_data/test*.to.remap.bam') + - glob.glob('test_data/test*.to.remap.num.gz') + - glob.glob('test_data/test_*_filtered.bam')) - [os.remove(x) for x in fns] - -class TestRun: - def test_simple_single(self): - is_paired_end = False - max_window = 100000 - pref = 'test_data/test_single' - file_name = pref + ".sort.bam" - keep_file_name = pref + ".keep.bam" - remap_name = pref + ".to.remap.bam" - remap_num_name = pref + ".to.remap.num.gz" - fastq_names = [pref + ".remap.fq.gz"] - snp_dir = 'test_data/snps' - bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - remap_name, remap_num_name, fastq_names, snp_dir) - bs.run() - - keep_bam = pref + '_filtered.bam' - run(remap_name, 'test_data/test_single.remapped.bam', keep_bam, - remap_num_name, is_paired_end) - - lines = read_bam(keep_bam) - assert len(lines) == 1 - - cleanup() - - def test_simple_paired(self): - is_paired_end = True - max_window = 100000 - pref = 'test_data/test_paired' - file_name = pref + ".sort.bam" - keep_file_name = pref + ".keep.bam" - remap_name = pref + ".to.remap.bam" - remap_num_name = pref + ".to.remap.num.gz" - fastq_names = [pref + ".remap.fq1.gz", - pref + ".remap.fq2.gz"] - snp_dir = 'test_data/snps' - bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - remap_name, remap_num_name, fastq_names, snp_dir) - bs.run() - - keep_bam = pref + '_filtered.bam' - run(remap_name, 'test_data/test_paired.remapped.bam', keep_bam, - remap_num_name, is_paired_end) - - lines = read_bam(keep_bam) - assert len(lines) == 2 - - cleanup() - - def test_simple_single_unmapped(self): - """Test to make sure that if the read pair is unmapped in the remapping - stage, it is not written to the final output file.""" - is_paired_end = False - max_window = 100000 - pref = 'test_data/test_single' - file_name = pref + ".sort.bam" - keep_file_name = pref + ".keep.bam" - remap_name = pref + ".to.remap.bam" - remap_num_name = pref + ".to.remap.num.gz" - 
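The remapped read names in the test data above are '.'-delimited, e.g. `SRR1658224.34085432.16052611-16052734.1.2`. Judging from the header comments and the `.1.2`/`.2.2` suffixes, the last three fields appear to be the original coordinate(s), the copy number, and the total number of remapped copies; that interpretation is an inference from the test data rather than something stated outright. A hedged sketch of pulling the fields apart:

    name = "SRR1658224.34085432.16052611-16052734.1.2"
    fields = name.split(".")

    orig_name = ".".join(fields[:-3])   # original read name
    coord = fields[-3]                  # left/right start positions
    copy_num, total_copies = int(fields[-2]), int(fields[-1])

    assert orig_name == "SRR1658224.34085432"
    assert coord.split("-") == ["16052611", "16052734"]
    assert (copy_num, total_copies) == (1, 2)
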
fastq_names = [pref + ".remap.fq.gz"] - snp_dir = 'test_data/snps' - bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - remap_name, remap_num_name, fastq_names, snp_dir) - bs.run() - - keep_bam = pref + '_filtered.bam' - run(remap_name, 'test_data/test_single_unmapped.remapped.bam', keep_bam, - remap_num_name, is_paired_end) - - lines = read_bam(keep_bam) - assert lines == [''] - - cleanup() - - def test_simple_paired_unmapped(self): - """Test to make sure that if the read pair is unmapped in the remapping - stage, it is not written to the final output file.""" - is_paired_end = True - max_window = 100000 - pref = 'test_data/test_paired' - file_name = pref + ".sort.bam" - keep_file_name = pref + ".keep.bam" - remap_name = pref + ".to.remap.bam" - remap_num_name = pref + ".to.remap.num.gz" - fastq_names = [pref + ".remap.fq1.gz", - pref + ".remap.fq2.gz"] - snp_dir = 'test_data/snps' - bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - remap_name, remap_num_name, fastq_names, snp_dir) - bs.run() - - keep_bam = pref + '_filtered.bam' - run(remap_name, 'test_data/test_paired_unmapped.remapped.bam', keep_bam, - remap_num_name, is_paired_end) - - lines = read_bam(keep_bam) - assert lines == [''] - - cleanup() - - def test_simple_single_reverse(self): - """Test to make sure that if the read is mapped correctly on the reverse - strand in the remapping stage, it is written to the final output - file.""" - is_paired_end = False - max_window = 100000 - pref = 'test_data/test_single_reverse' - file_name = pref + ".sort.bam" - keep_file_name = pref + ".keep.bam" - remap_name = pref + ".to.remap.bam" - remap_num_name = pref + ".to.remap.num.gz" - fastq_names = [pref + ".remap.fq.gz"] - snp_dir = 'test_data/snps' - bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - remap_name, remap_num_name, fastq_names, snp_dir) - bs.run() - - keep_bam = pref + '_filtered.bam' - run(remap_name, 'test_data/test_single_reverse.remapped.bam', keep_bam, - remap_num_name, is_paired_end) - - lines = read_bam(keep_bam) - assert len(lines) == 1 - - cleanup() - - def test_simple_paired_reverse(self): - """Test to make sure that if the first read in the read pair is mapped - correctly on the reverse strand in the remapping stage, it is written to - the final output file.""" - cleanup() - is_paired_end = True - max_window = 100000 - pref = 'test_data/test_paired_reverse' - file_name = pref + ".sort.bam" - keep_file_name = pref + ".keep.bam" - remap_name = pref + ".to.remap.bam" - remap_num_name = pref + ".to.remap.num.gz" - fastq_names = [pref + ".remap.fq1.gz", - pref + ".remap.fq2.gz"] - snp_dir = 'test_data/snps' - bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - remap_name, remap_num_name, fastq_names, snp_dir) - bs.run() - - keep_bam = pref + '_filtered.bam' - run(remap_name, 'test_data/test_paired_reverse.remapped.bam', keep_bam, - remap_num_name, is_paired_end) - - lines = read_bam(keep_bam) - assert len(lines) == 2 - - cleanup() - - def test_two_snps_single(self): - is_paired_end = False - max_window = 100000 - pref = 'test_data/test_single' - file_name = pref + ".sort.bam" - keep_file_name = pref + ".keep.bam" - remap_name = pref + ".to.remap.bam" - remap_num_name = pref + ".to.remap.num.gz" - fastq_names = [pref + ".remap.fq.gz"] - snp_dir = 'test_data/two_snps' - bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - remap_name, remap_num_name, fastq_names, snp_dir) - bs.run() - - keep_bam = pref + '_filtered.bam' - 
run(remap_name, 'test_data/test_two_snps_single.remapped.bam', keep_bam, - remap_num_name, is_paired_end) + +def test_filter_remapped_reads_pe(): + test_dir = "test_data" + to_remap_bam_filename = "test_data/test.to.remap.bam" + remap_bam_filename = "test_data/test.remap.bam" + keep_bam_filename = "test_data/keep.bam" + + # write test input data + write_to_remap_bam_pe(data_dir=test_dir, bam_filename=to_remap_bam_filename) + write_remap_bam_pe(data_dir=test_dir, bam_filename=remap_bam_filename) + + # run filter remapped reads + filter_remapped_reads.main(to_remap_bam_filename, remap_bam_filename, keep_bam_filename) - lines = read_bam(keep_bam) - assert len(lines) == 1 - - cleanup() - - # def test_two_snps_paired(self): - # # TODO: The remapped bam file should have six entries, only has two. - # is_paired_end = True - # max_window = 100000 - # pref = 'test_data/test_paired' - # file_name = pref + ".sort.bam" - # keep_file_name = pref + ".keep.bam" - # remap_name = pref + ".to.remap.bam" - # remap_num_name = pref + ".to.remap.num.gz" - # fastq_names = [pref + ".remap.fq1.gz", - # pref + ".remap.fq2.gz"] - # snp_dir = 'test_data/two_snps' - # bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - # remap_name, remap_num_name, fastq_names, snp_dir) - # bs.run() - # - # keep_bam = pref + '_filtered.bam' - # run(remap_name, 'test_data/test_two_snps_paired.remapped.bam', keep_bam, - # remap_num_name, is_paired_end) - - # lines = read_bam(keep_bam) - # assert len(lines) == 2 - - # cleanup() - - # def test_issue_18(self): - # """ - # This was reported as a bug because one read pair that overlaps one SNP - # was resulting in multiple pairs of reads in the fastq files. However, it - # is not a bug because the reads overlap and both reads overlap the SNP. - # """ - # # TODO: The remapped bam file does not correspond to the input bam file. - # is_paired_end = True - # max_window = 100000 - # file_name = 'test_data/issue_18.bam' - # pref = 'test_data/test_paired' - # keep_file_name = pref + ".keep.bam" - # remap_name = pref + ".to.remap.bam" - # remap_num_name = pref + ".to.remap.num.gz" - # fastq_names = [pref + ".remap.fq1.gz", - # pref + ".remap.fq2.gz"] - # snp_dir = 'test_data/issue_18_snps' - # bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - # remap_name, remap_num_name, fastq_names, snp_dir) - # bs.run() - # - # keep_bam = pref + '_filtered.bam' - # run(remap_name, 'test_data/test_issue_18.remapped.bam', keep_bam, - # remap_num_name, is_paired_end) - - # lines = read_bam(keep_bam) - # assert len(lines) == 2 - - # cleanup() - # - # def test_issue_23(self): - # """ - # This was reported as a bug because WASP said the read pair mapped - # correctly yet wasn't written to the output file. - # """ - # # TODO: I think the test above was copied down but the test for but the - # # test for issue 23 was not implemented. Maybe I accidentally copied - # # over it at some point. 
- # is_paired_end = True - # max_window = 100000 - # file_name = 'test_data/issue_18.bam' - # pref = 'test_data/test_paired' - # keep_file_name = pref + ".keep.bam" - # remap_name = pref + ".to.remap.bam" - # remap_num_name = pref + ".to.remap.num.gz" - # fastq_names = [pref + ".remap.fq1.gz", - # pref + ".remap.fq2.gz"] - # snp_dir = 'test_data/issue_18_snps' - # bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - # remap_name, remap_num_name, fastq_names, snp_dir) - # bs.run() - # - # keep_bam = pref + '_filtered.bam' - # run(remap_name, 'test_data/test_issue_18.remapped.bam', keep_bam, - # remap_num_name, is_paired_end) - - # lines = read_bam(keep_bam) - # assert len(lines) == 2 - - # cleanup() + # read in filtered reads + lines = read_bam(keep_bam_filename) + + # read lines from keep BAM file + read_dict = {} + for line in lines: + words = line.split() + read_name = words[0] + if read_name in read_dict: + read_dict[read_name].append(words) + else: + read_dict[read_name] = [words] + + # verify that filtered reads look correct + + # we expect a read pair with this identifier: + read_name = "SRR1658224.34085432" + assert read_name in read_dict + reads = read_dict[read_name] + assert len(reads) == 2 + + pos1 = int(reads[0][3]) + pos2 = int(reads[1][3]) + assert pos1 == 16052611 + assert pos2 == 16052734 + + + # expect these read pairs to be filtered out (not present) + # only one version of read pair maps (expect 2) + assert "SRR1658224.34975561" not in read_dict + + # 1/2 of second read pair missing + assert "SRR1658224.7462188" not in read_dict + + # 1 pair maps to wrong location + assert "SRR1658224.31153145" not in read_dict + + # neither pair maps + assert "SRR1658224.25014179" not in read_dict + + # expect these (interleaved) read pairs to be kept + read_name = "readpair1" + assert read_name in read_dict + reads = read_dict[read_name] + assert len(reads) == 2 + pos1 = int(reads[0][3]) + pos2 = int(reads[1][3]) + assert pos1 == 100 + assert pos2 == 200 + + read_name = "readpair2" + assert read_name in read_dict + reads = read_dict[read_name] + assert len(reads) == 2 + pos1 = int(reads[0][3]) + pos2 = int(reads[1][3]) + assert pos1 == 150 + assert pos2 == 250 + + + - def test_bad_first_paired(self): - """Test whether the correct read pairs are output if the first read pair - is incorrectly remapped.""" - remap_name = 'test_data/bad_first_paired/bad_first_to_remap.bam' - keep_bam = 'test_data/test_bad_first_filtered.bam' - remapped_bam = 'test_data/bad_first_paired/bad_first_remapped.bam' - remap_num_name = 'test_data/bad_first_paired/bad_first.to.remap.num.gz' - is_paired_end = True - run(remap_name, remapped_bam, keep_bam, remap_num_name, is_paired_end) - - lines = read_bam(keep_bam) - assert len(lines) == 6 - - cleanup() - - def test_bad_middle_paired(self): - """Test whether the correct read pairs are output if a read pair in the - middle is incorrectly remapped.""" - remap_name = 'test_data/bad_middle_paired/bad_middle_to_remap.bam' - keep_bam = 'test_data/test_bad_middle_filtered.bam' - remapped_bam = 'test_data/bad_middle_paired/bad_middle_remapped.bam' - remap_num_name = 'test_data/bad_middle_paired/bad_middle.to.remap.num.gz' - is_paired_end = True - run(remap_name, remapped_bam, keep_bam, remap_num_name, is_paired_end) - - lines = read_bam(keep_bam) - assert len(lines) == 6 - - def test_bad_last_paired(self): - """Test whether the correct read pairs are output if the last read pair - is incorrectly remapped.""" - remap_name = 
'test_data/bad_last_paired/bad_last_to_remap.bam' - keep_bam = 'test_data/test_bad_last_filtered.bam' - remapped_bam = 'test_data/bad_last_paired/bad_last_remapped.bam' - remap_num_name = 'test_data/bad_last_paired/bad_last.to.remap.num.gz' - is_paired_end = True - run(remap_name, remapped_bam, keep_bam, remap_num_name, is_paired_end) - lines = read_bam(keep_bam) - assert len(lines) == 6 - - cleanup() - -class TestCLI: - def test_simple_single_cli(self): - is_paired_end = False - max_window = 100000 - pref = 'test_data/test_single' - file_name = pref + ".sort.bam" - keep_file_name = pref + ".keep.bam" - remap_name = pref + ".to.remap.bam" - remap_num_name = pref + ".to.remap.num.gz" - fastq_names = [pref + ".remap.fq.gz"] - snp_dir = 'test_data/snps' - bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - remap_name, remap_num_name, fastq_names, snp_dir) - bs.run() - - keep_bam = pref + '_filtered.bam' - c = ('python filter_remapped_reads.py {} {} {} {}'.format( - remap_name, 'test_data/test_single.remapped.bam', keep_bam, - remap_num_name)) - subprocess.check_call(c, shell=True) - - lines = read_bam(keep_bam) - assert len(lines) == 1 - - cleanup() + + + diff --git a/mapping/test_find_intersecting_snps.py b/mapping/test_find_intersecting_snps.py index 76bccba..f90d333 100644 --- a/mapping/test_find_intersecting_snps.py +++ b/mapping/test_find_intersecting_snps.py @@ -1,339 +1,2373 @@ import glob import gzip import os +import os.path import subprocess +import sys +import tables +import numpy as np -from find_intersecting_snps import * +import find_intersecting_snps def read_bam(bam): """ Read a bam file into a list where each element of the list is a line from the bam file (with the newline stripped). The header is discarded. """ - res = subprocess.check_output('samtools view {}'.format(bam), shell=True) + res = subprocess.check_output('samtools view %s' % bam, shell=True) return res.strip().split('\n') -def cleanup(): - fns = (glob.glob('test_data/test*.keep.bam') + - glob.glob('test_data/test*.remap.fq*.gz') + - glob.glob('test_data/test*.to.remap.bam') + - glob.glob('test_data/test*.to.remap.num.gz')) - [os.remove(x) for x in fns] - -class TestSNP: - def test_init(self): - """Test to see whether __init__ is working as expected.""" - snp = SNP('12670\tG\tC\n') - assert snp.pos == 12670 - 1 - assert snp.alleles == ['G', 'C'] - assert snp.ptype == 'snp' - assert snp.max_len == 1 - - def test_add_allele(self): - """Test to see whether we can add an allele.""" - snp = SNP('12670\tG\tC\n') - snp.add_allele(['A']) - assert snp.alleles == ['G', 'C', 'A'] + + + + +class Data(object): + """This class creates data that can be used for the tests""" + + def __init__(self, + data_dir="test_data", + prefix="test_data/test", + output_prefix=None, + read1_seqs = ["AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"], + read2_seqs = ["TTTTTTTTTTATTTTTTTTTTTTTTTTTTT"], + read1_quals = ["BBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"], + read2_quals = ["BBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"], + genome_seqs = ["AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n" + + "TTTTTTTTTTATTTTTTTTTTTTTTTTTTT"], + chrom_names = ['test_chrom'], + read1_names = None, + read2_names = None, + snp_list = [['test_chrom', 1, "A", "C"]], + hap_samples = ["samp1", "samp2", "samp3", "samp4"], + haplotypes = [[0, 1, 0, 1]]): + + if output_prefix is None: + self.output_prefix = prefix + else: + self.output_prefix = output_prefix + + self.data_dir = data_dir + self.prefix = prefix + self.read1_seqs = read1_seqs + self.read1_quals = read1_quals + self.read2_seqs = 
read2_seqs + self.read2_quals = read2_quals + self.genome_seqs = list(genome_seqs) + self.snp_list = list(snp_list) + + self.read1_names = read1_names + self.read2_names = read2_names + + self.genome_prefix = self.prefix + "_genome" + self.genome_filename = self.genome_prefix + ".fa" + self.chrom_names = list(chrom_names) + + self.hap_samples = hap_samples + self.haplotypes = haplotypes + + + self.fastq1_filename = self.prefix + "_1.fq" + self.fastq2_filename = self.prefix + "_2.fq" + + self.sam_filename = self.prefix + ".sam" + self.bam_sort_filename = self.prefix + ".sort.bam" + self.bam_filename = self.prefix + ".bam" + + self.snp_dir = self.prefix + "_snps" + + # these are files that are written by find_intersecting_snps + self.bam_keep_filename = self.output_prefix + ".keep.bam" + self.bam_remap_filename = self.output_prefix + ".to.remap.bam" + self.fastq_remap_filename = self.output_prefix + ".remap.fq.gz" + self.fastq1_remap_filename = self.output_prefix + ".remap.fq1.gz" + self.fastq2_remap_filename = self.output_prefix + ".remap.fq2.gz" + + self.snp_tab_filename = self.prefix + "_snp_tab.h5" + self.snp_index_filename = self.prefix + "_snp_index.h5" + self.haplotype_filename = self.prefix + "_haplotype.h5" + + + + def setup(self): + """Create the test genome, test fastq and test SNP files""" + if not os.path.exists(self.data_dir): + os.makedirs(self.data_dir) + self.write_ref_genome() + self.write_fastqs() + self.write_snps() + self.write_h5_files() + + + + def cleanup(self): + """remove files created by the tests""" + filenames = [ + self.genome_filename, + self.fastq1_filename, + self.fastq2_filename, + self.sam_filename, + self.bam_filename, + self.bam_sort_filename, + self.bam_keep_filename, + self.bam_remap_filename, + self.fastq_remap_filename, + self.fastq1_remap_filename, + self.fastq2_remap_filename, + self.snp_index_filename, + self.snp_tab_filename, + self.haplotype_filename] + + index_filenames = glob.glob(self.genome_prefix + "*.bt2") + filenames.extend(index_filenames) + + snp_filenames = glob.glob(self.snp_dir + "/*.snps.txt.gz") + filenames.extend(snp_filenames) + + for fname in filenames: + if os.path.exists(fname): + os.remove(fname) + + if os.path.exists(self.snp_dir): + os.rmdir(self.snp_dir) + + + def write_ref_genome(self): + f = open(self.genome_filename, "w") + for chrom_name, genome_seq in zip(self.chrom_names, self.genome_seqs): + f.write(">" + chrom_name + "\n" + genome_seq + "\n") + f.close() + + + def write_fastqs(self): + if self.read1_seqs: + # write fastq1 + + if self.read1_names is None: + names = ["read%d" % (x+1) for x in range(len(self.read1_seqs))] + else: + names = self.read1_names + + f = open(self.fastq1_filename, "w") + i = 0 + for seq_str, qual_str in zip(self.read1_seqs, self.read1_quals): + f.write("@%s\n" % names[i]) + f.write(seq_str + "\n") + f.write("+%s\n" % names[i]) + f.write(qual_str + "\n") + i += 1 + + f.close() + + if self.read2_seqs: + if self.read2_names is None: + names = ["read%d" % (x+1) for x in range(len(self.read2_seqs))] + else: + names = self.read2_names + + f = open(self.fastq2_filename, "w") + i = 0 + for seq_str, qual_str in zip(self.read2_seqs, self.read2_quals): + f.write("@%s\n" % names[i]) + f.write(seq_str + "\n") + f.write("+%s\n" % names[i]) + f.write(qual_str + "\n") + i += 1 + f.close() + + + + def index_genome_bowtie2(self): + cmd = ['bowtie2-build', self.genome_filename, self.genome_prefix] + # write stderr and stdout to /dev/null because bowtie2-build writes a + # lot of output + fnull = 
open(os.devnull, 'w') + subprocess.check_call(cmd, stdout=fnull, stderr=subprocess.STDOUT) + + + def map_single_bowtie2(self): + cmd = ['bowtie2', '-x', self.genome_prefix, "-U", self.fastq1_filename, + "-S", self.sam_filename] + subprocess.check_call(cmd) + + + def map_paired_bowtie2(self): + cmd = ['bowtie2', '-x', self.genome_prefix, "-1", self.fastq1_filename, + "-2", self.fastq2_filename, "-S", self.sam_filename] + subprocess.check_call(cmd) + + + + def sam2bam(self): + cmd = 'samtools view -S -b %s > %s' % \ + (self.sam_filename, self.bam_filename) + subprocess.check_call(cmd, shell=True) + + + def write_snps(self): + files = {} + + if not os.path.exists(self.snp_dir): + os.makedirs(self.snp_dir) + + for snp in self.snp_list: + chrom, pos, allele1, allele2 = snp + if chrom not in files: + filename = self.snp_dir + "/" + chrom + ".snps.txt.gz" + files[chrom] = gzip.open(filename, "wb") + files[chrom].write("%d\t%s\t%s\n" % (pos, allele1, allele2)) + + for f in files.values(): + f.close() + + + + def write_hap_samples(self, h5f): + """Write tables containing sample names to HDF5 file""" + class SamplesTab(tables.IsDescription): + name = tables.StringCol(64) + + for chrom_name in self.chrom_names: + table = h5f.createTable(h5f.root, "samples_%s" % chrom_name, + SamplesTab) + + for samp in self.hap_samples: + row = table.row + row['name'] = samp + row.append() + + + + def write_snp_tab_h5(self): + snp_tab_h5 = tables.openFile(self.snp_tab_filename, "w") + + class SNPTab(tables.IsDescription): + name = tables.StringCol(16) + pos = tables.Int64Col() + allele1 = tables.StringCol(100) + allele2 = tables.StringCol(100) + + chrom_tables = {} + snp_num = 0 + for snp in self.snp_list: + if snp[0] in chrom_tables: + table = chrom_tables[snp[0]] + else: + table = snp_tab_h5.createTable(snp_tab_h5.root, snp[0], SNPTab) + chrom_tables[snp[0]] = table + + row = table.row + snp_num += 1 + row['name'] = "snp%d" % snp_num + row['pos'] = snp[1] + row['allele1'] = snp[2] + row['allele2'] = snp[3] + row.append() + + self.write_hap_samples(snp_tab_h5) + + snp_tab_h5.close() + + + def get_chrom_lengths(self): + chrom_lengths = {} + + for chrom_name, genome_seq in zip(self.chrom_names, self.genome_seqs): + genome_seq = genome_seq.replace("\n", "").replace(" ", "") + chrom_lengths[chrom_name] = len(genome_seq) + + return chrom_lengths + + + + def write_haplotype_h5(self): + chrom_lengths = self.get_chrom_lengths() + + atom = tables.Int8Atom(dflt=0) + zlib_filter = tables.Filters(complevel=1, complib="zlib") + + hap_h5 = tables.openFile(self.haplotype_filename, "w") + + chrom_haps = {} + snp_index = 0 + + # group haplotypes by chromosome + for snp, hap in zip(self.snp_list, self.haplotypes): + if snp[0] in chrom_haps: + chrom_haps[snp[0]].append(hap) + else: + chrom_haps[snp[0]] = [hap] + + for chrom, haps in chrom_haps.items(): + hap_array = np.array(haps, dtype=np.int8) + carray = hap_h5.createCArray(hap_h5.root, + chrom, atom, hap_array.shape, + filters=zlib_filter) + carray[:] = haps + + self.write_hap_samples(hap_h5) + + hap_h5.close() + + + + def write_snp_index_h5(self): + atom = tables.Int16Atom(dflt=0) + zlib_filter = tables.Filters(complevel=1, complib="zlib") + + snp_index_h5 = tables.openFile(self.snp_index_filename, "w") + + snp_index = 0 + + chrom_arrays = {} + chrom_lengths = self.get_chrom_lengths() + + for snp in self.snp_list: + if snp[0] in chrom_arrays: + carray = chrom_arrays[snp[0]] + else: + # create CArray for this chromosome + shape = [chrom_lengths[snp[0]]] + carray = 
snp_index_h5.createCArray(snp_index_h5.root, + snp[0], atom, shape, + filters=zlib_filter) + carray[:] = -1 + chrom_arrays[snp[0]] = carray + + pos = snp[1] + carray[pos-1] = snp_index + snp_index += 1 + + self.write_hap_samples(snp_index_h5) + + snp_index_h5.close() + + + def write_h5_files(self): + self.write_snp_tab_h5() + self.write_snp_index_h5() + self.write_haplotype_h5() + + + + + + +class TestSingleEnd: + """tests for single end read mapping""" + + + def test_single_one_read_one_snp(self): + """Simple test of whether 1 read overlapping + 1 SNP works correctly""" + test_data = Data() + test_data.setup() + test_data.index_genome_bowtie2() + test_data.map_single_bowtie2() + test_data.sam2bam() + + find_intersecting_snps.main(test_data.bam_filename, + is_paired_end=False, + is_sorted=False, + snp_dir=test_data.snp_dir) + + # + # Verify new fastq is correct. The first base of the first read + # should be switched from a C to an A. + # + with gzip.open(test_data.fastq_remap_filename) as f: + lines = [x.strip() for x in f.readlines()] + assert len(lines) == 4 + + l = list(test_data.read1_seqs[0]) + l[0] = 'C' + new_seq = "".join(l) + + assert lines[1] == new_seq + assert lines[3] == test_data.read1_quals[0] + + # + # Verify to.remap bam is the same as the input bam file. + # + old_lines = read_bam(test_data.bam_filename) + new_lines = read_bam(test_data.bam_remap_filename) + assert old_lines == new_lines + + # + # Verify that the keep file is empty since only + # read needs to be remapped. Note that the + # read_bam still gives back one empty line. + # + lines = read_bam(test_data.bam_keep_filename) + assert len(lines) == 1 + assert lines[0] == '' + + test_data.cleanup() + + + + def test_single_two_read_two_snp_two_chrom(self): + """Test whether having two chromosomes works, with reads + and SNPs on both works correctly""" + + test_data = Data(read1_seqs = ["AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", + "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"], + read1_quals = ["BBBBBBBBBBBBBBBBBBBBBBBBBBBBBB", + "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"], + genome_seqs = ["AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n" + + "TTTTTTTTTTATTTTTTTTTTTTTTTTTTT", + "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGG\n" + + "CCCCCCCCCCGCCCCCCCCCCCCCCCCCCC"], + chrom_names = ['test_chrom1', 'test_chrom2'], + snp_list = [['test_chrom1', 1, "A", "C"], + ['test_chrom2', 3, "G", "C"]]) + + test_data.setup() + test_data.index_genome_bowtie2() + test_data.map_single_bowtie2() + test_data.sam2bam() + + find_intersecting_snps.main(test_data.bam_filename, + snp_dir=test_data.snp_dir, is_paired_end=False, + is_sorted=False) + + # + # Verify new fastq is correct. The first base of the first read + # should be switched from C to an A, and the third base of second read + # should be switched from C to G + # + with gzip.open(test_data.fastq_remap_filename) as f: + lines = [x.strip() for x in f.readlines()] + assert len(lines) == 8 + + l = list(test_data.read1_seqs[0]) + l[0] = 'C' + new_seq = "".join(l) + assert lines[1] == new_seq + assert lines[3] == test_data.read1_quals[0] + + l = list(test_data.read1_seqs[1]) + l[2] = 'C' + new_seq = "".join(l) + + assert lines[5] == new_seq + assert lines[7] == test_data.read1_quals[1] + + # + # Verify to.remap bam is the same as the input bam file. + # + old_lines = read_bam(test_data.bam_filename) + new_lines = read_bam(test_data.bam_remap_filename) + assert old_lines == new_lines + + # + # Verify that the keep file is empty since only + # read needs to be remapped. Note that the + # read_bam still gives back one empty line. 
+ # + lines = read_bam(test_data.bam_keep_filename) + assert len(lines) == 1 + assert lines[0] == '' + + test_data.cleanup() + + + + def test_single_gapD_read_two_snps(self): + """Test whether read with D in alignment works correctly""" + # + # Currently WASP discards reads that overlap + # indels. We want to improve WASP to handle indels + # correctly, but in the meantime this test just checks + # that the read is discarded. + # + + # create read that will align with a 'D' in CIGAR string + test_data = Data(read1_seqs = ["ACTGACTGAAACTGACTGACTGACTGACTTTTTTTTTTATTTTTTTTTTTTTTTTTTT"], + read1_quals = ["BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"], + genome_seqs = ["ACTGACTGAAAAACTGACTGACTGACTGAC\n" + + "TTTTTTTTTTATTTTTTTTTTTTTTTTTTT"], + chrom_names = ['test_chrom1'], + snp_list = [['test_chrom1', 1, "A", "C"], + ['test_chrom1', 15, "T", "C"]]) + + test_data.setup() + test_data.index_genome_bowtie2() + test_data.map_single_bowtie2() + test_data.sam2bam() + + # pos: 123456789012345678901234567890 + # genome: ACTGACTGAAAAACTGACTGACTGACTGACTTTTTTTTTTATTTTTTTTTTTTTTTTTTT + # snps: ^ ^ + # read: ACTGACTGAAA--CTGACTGACTGACTGACTTTTTTTTTTATTTTTTTTTTTTTTTTTTT + # read_pos:12345678901--23456789012345678 + + find_intersecting_snps.main(test_data.bam_filename, + snp_dir=test_data.snp_dir, + is_paired_end=False, is_sorted=False) + + + # + # Verify new fastq is correct. There should be 3 reads + # with all possible configurations of the two alleles, except + # for the original configuration. + # + with gzip.open(test_data.fastq_remap_filename) as f: + lines = [x.strip() for x in f.readlines()] + assert len(lines) == 12 + + seqs = [lines[1], lines[5], lines[9]] + + l = list(test_data.read1_seqs[0]) + l[0] = 'C' + new_seq1 = "".join(l) + + l = list(test_data.read1_seqs[0]) + l[12] = 'C' + new_seq2 = "".join(l) + + # read with both non-ref alleles + l = list(test_data.read1_seqs[0]) + l[0] = 'C' + l[12] = 'C' + new_seq3 = "".join(l) + + assert len(seqs) == 3 + assert new_seq1 in seqs + assert new_seq2 in seqs + assert new_seq3 in seqs + + # Check the new reads are named correctly + assert lines[0] == "@read1.1.1.3" + assert lines[4] == "@read1.1.2.3" + assert lines[8] == "@read1.1.3.3" + + # Verify to.remap bam is the same as the input bam file. + old_lines = read_bam(test_data.bam_filename) + new_lines = read_bam(test_data.bam_remap_filename) + assert old_lines == new_lines + + # Verify that the keep file is empty since only + # read needs to be remapped. Note that the + # read_bam still gives back one empty line. 
+ lines = read_bam(test_data.bam_keep_filename) + assert len(lines) == 1 + assert lines[0] == '' + + test_data.cleanup() + + + - def test_add_allele_multiple(self): - """Test to see whether we can add multiple alleles.""" - snp = SNP('12670\tG\tC\n') - snp.add_allele(['A', 'T']) - assert snp.alleles == ['G', 'C', 'A', 'T'] - - # TODO: tests for adding insertions and deletions - -class TestBamScanner: - def test_init_single(self): - is_paired_end = False - max_window = 100000 - pref = 'test_data/test_single' - file_name = pref + ".sort.bam" - keep_file_name = pref + ".keep.bam" - remap_name = pref + ".to.remap.bam" - remap_num_name = pref + ".to.remap.num.gz" - fastq_names = [pref + ".remap.fq.gz"] - snp_dir = 'test_data/snps' - bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - remap_name, remap_num_name, fastq_names, snp_dir) - assert bs.max_window == len(bs.snp_table) - cleanup() - - def test_init_paired(self): - is_paired_end = True - max_window = 100000 - pref = 'test_data/test_paired' - file_name = pref + ".sort.bam" - keep_file_name = pref + ".keep.bam" - remap_name = pref + ".to.remap.bam" - remap_num_name = pref + ".to.remap.num.gz" - fastq_names = [pref + ".remap.fq1.gz", - pref + ".remap.fq2.gz"] - snp_dir = 'test_data/snps' - bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - remap_name, remap_num_name, fastq_names, snp_dir) - assert bs.max_window == len(bs.snp_table) - cleanup() - - def test_simple_single(self): - is_paired_end = False - max_window = 100000 - pref = 'test_data/test_single' - file_name = pref + ".sort.bam" - keep_file_name = pref + ".keep.bam" - remap_name = pref + ".to.remap.bam" - remap_num_name = pref + ".to.remap.num.gz" - fastq_names = [pref + ".remap.fq.gz"] - snp_dir = 'test_data/snps' - bs = BamScanner(is_paired_end, max_window, file_name, keep_file_name, - remap_name, remap_num_name, fastq_names, snp_dir) - bs.run() - - # Verify fastq is correct. The second base of the first read should be - # switched from a C to an A. 
(14538 C A) - seq = ('CATCAAGCCAGCCTTCCGCTCCTTGAAGCTGGTCTCCACACAGTGCTGGTTCCGTCACCCCC' - 'TCCCAAGGAAGTAGGTCTGAGCAGCTTGTCCTGGCTGT') - qual = ('BBBBBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF' - 'FFFFFBFF +# + + +def write_sam_header(f): + f.write("@HD VN:1.0 SO:coordinate\n") + f.write("@SQ SN:chr22 LN:51304566\n") + f.write('@PG ID:bowtie2 PN:bowtie2 VN:2.2.6 CL:"/iblm/netapp/home/gmcvicker/anaconda2/bin/bowtie2-align-s --wrapper basic-0 -x /iblm/netapp/data1/external/GRC37/combined/bowtie2_index/hg37 -1 /tmp/16686.inpipe1 -2 /tmp/16686.inpipe2\n') + + + +def write_bam_pe(data_dir="test_data", bam_filename="test_data/rmdup_input.bam"): + sam_lines = ["readpair1 163 chr22 100 12 101M = 200 201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + + # duplicate of first read pair + "dup_readpair1 163 chr22 100 12 101M = 200 201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + "readpair2 163 chr22 150 12 101M = 250 201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + + # readpair 3 has same first read, but different second read as readpair2 + "readpair3 163 chr22 150 12 101M = 251 202 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + + # readpair4 has same positions as readpair2 + "readpair4 163 chr22 150 12 101M = 250 201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + + "dup_readpair1 83 chr22 200 12 101M = 100 -201 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP", + + "readpair1 83 chr22 200 12 101M = 100 -201 TCCTGACAGCATGTGCCCAAGGTGGTCAGGATACAGCTTGCTTCTATATATTTTAGGGAGAAAATACATCAGCCTGTAAACAAAAAATTAAATTCTAAGGT DDDDDDDDDDDDDDEDEEEFFFFHHFHHIIFIIJJJJIJJJJJJJJJJIIJJJIIIJIJIJJJJIFIIIJJIJJJJJJJIIJJJJJJJHHHHHFFFFFCCC AS:i:0 XS:i:-12 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:101 YS:i:-11 YT:Z:CP", + + "readpair2 163 chr22 250 12 101M = 150 -201 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + + "readpair4 163 chr22 250 12 101M = 150 -201 
TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + + "readpair3 163 chr22 251 12 101M = 150 -202 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + # couple of read pairs that are completely overlapping + # (i.e. at same position) + "readpair5 163 chr22 500 12 101M = 500 -101 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + + "readpair5 163 chr22 500 12 101M = 500 -101 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + "dup_readpair5 163 chr22 500 12 101M = 500 -101 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP", + "dup_readpair5 163 chr22 500 12 101M = 500 -101 TGGAGACATAAAATGAGGCATATCTGACCTCCACTTCCAAAAACATCTGAGATAGGTCTCAGTTAATTAAGAAAGTTTGTTCTGCCTAGTTTAAGGACATG CCCFFFFFHHHHHJJJJJJJJJJJJJJJIJJJJJJJJJJJJJJJJJIJJIHIJJJJEHIJJJHJJJJJJJJJJJJ=DHHHHHFFFFFFEEEEEEDDCDDDC AS:i:-11 XS:i:-17 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:7G44C48 YS:i:0 YT:Z:CP" + + ] + + if not os.path.exists(data_dir): + os.makedirs(data_dir) + + # write temporary file in SAM format, before converting to BAM + sam_filename = data_dir + "/tmp.sam" + f = open(sam_filename, "w") + write_sam_header(f) + for line in sam_lines: + f.write(line + "\n") + f.close() + + subprocess.check_call("samtools view -b %s > %s" % (sam_filename, bam_filename), shell=True) + + +def read_bam(bam): + """ + Read a bam file into a list where each element of the list is a line from + the bam file (with the newline stripped). The header is discarded. 
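+    Requires samtools to be available on the PATH, since the file is read
+    by shelling out to 'samtools view'.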
+ """ + res = subprocess.check_output('samtools view %s' % bam, shell=True) + return res.strip().split('\n') + + +def test_rmdup_pe(): + test_dir = "test_data" + rmdup_input_bam = "test_data/rmdup_input.bam" + rmdup_output_bam = "test_data/rmdup_output.bam" + + # write test input data + write_bam_pe(data_dir=test_dir, bam_filename=rmdup_input_bam) + + # remove duplicates + rmdup_pe.main(rmdup_input_bam, rmdup_output_bam) + + # read in filtered reads + lines = read_bam(rmdup_output_bam) + + # read lines from keep BAM file + read_dict = {} + for line in lines: + words = line.split() + read_name = words[0] + if read_name in read_dict: + read_dict[read_name].append(words) + else: + read_dict[read_name] = [words] + + # verify that filtered reads look correct + + # expect one of readpair1 and dup_readpair (but not both) + # expect one of readpair2 and readpair4 (but not both) + # expect readpair3 + # expect one of readpair5 and dup_readpair5 (but not both) + + assert len(read_dict) == 4 + + # expect one of readpair1 and dup_readpair (but not both) + if "readpair1" in read_dict: + assert "dup_readpair1" not in read_dict + reads = read_dict["readpair1"] + else: + assert "dup_readpair1" in read_dict + reads = read_dict["dup_readpair1"] + + assert len(reads) == 2 + + pos1 = int(reads[0][3]) + pos2 = int(reads[1][3]) + assert pos1 == 100 + assert pos2 == 200 + + # expect readpair2 OR readpair4 to be present + if "readpair4" in read_dict: + assert "readpair2" not in read_dict + reads = read_dict["readpair4"] + else: + assert "readpair2" in read_dict + reads = read_dict["readpair2"] + + assert len(reads) == 2 + pos1 = int(reads[0][3]) + pos2 = int(reads[1][3]) + assert pos1 == 150 + assert pos2 == 250 + + # expect readpair3 to be present + assert "readpair3" in read_dict + reads =read_dict["readpair3"] + pos1 = int(reads[0][3]) + pos2 = int(reads[1][3]) + assert pos1 == 150 + assert pos2 == 251 + + + # expect readpair5 OR dup_readpair5 to be present + if "readpair5" in read_dict: + assert "dup_readpair5" not in read_dict + reads = read_dict["readpair5"] + else: + assert "dup_readpair5" in read_dict + reads = read_dict["dup_readpair5"] + + + + + + + + diff --git a/mapping/test_snptable.py b/mapping/test_snptable.py new file mode 100644 index 0000000..4519017 --- /dev/null +++ b/mapping/test_snptable.py @@ -0,0 +1,253 @@ + + +import snptable +import gzip +import os + +import numpy as np + +import pysam + + + + +class Data(): + + def __init__(self, data_dir="test_data", + snp_filename="test_data/snp_tab.txt.gz", + sam_filename="test_data/test.sam"): + self.data_dir = data_dir + self.snp_filename = snp_filename + self.sam_filename = sam_filename + + self.snp_list = [(10, "A", "C"), + (20, "T", "G"), + (100, "A", "T")] + + if not os.path.exists(data_dir): + os.makedirs(data_dir) + + + def setup(self): + snp_file = gzip.open(self.snp_filename, "wb") + + for snp in self.snp_list: + snp_file.write("%d %s %s\n" % (snp[0], snp[1], snp[2])) + snp_file.close() + + + + def write_sam_header(self, sam_file): + sam_file.write("@HD\tVN:1.0\tSO:unsorted\n") + sam_file.write("@SQ\tSN:test_chrom\tLN:60\n") + sam_file.write("@PG\tID:bowtie2\tPN:bowtie2\tVN:2.2.6" + "\tCL:\"/iblm/netapp/home/gmcvicker/anaconda2/bin/" + "bowtie2-align-s --wrapper basic-0 -x " + "test_data/test_genome -S test_data/test.sam -U " + "test_data/test_1.fq\"\n") + + + def write_sam_read(self, sam_file, read_name="read1#0/1", + flag=0, chrom="test_chrom", pos=1, + mapq=30, cigar="30M", rnext="*", + pnext=0, tlen=0, 
seq="AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", + qual="BBBBBBBBBBBBBBBBBBBBBBBBBBBBBB", + opt_fields=["AS:i:0", "XS:i:-5", "XN:i:0", + "XM:i:0", "XO:i:0", "XG:i:0", "NM:i:0", + "MD:Z:30", "YT:Z:UU"]): + sam_file.write("\t".join([read_name, "%d" % flag, chrom, "%d" % pos, + "%d" % mapq, cigar, rnext, "%d" % pnext, + "%d" % tlen, seq, qual, + "\t".join(opt_fields)]) + "\n") + + +class TestReadFile(object): + + def test_read_snps(self): + data = Data() + data.setup() + + snp_tab = snptable.SNPTable() + snp_tab.read_file(data.snp_filename) + + # check snp_index set correctly + assert len(snp_tab.snp_index) == 100 + assert snp_tab.snp_index[9] == 0 + assert snp_tab.snp_index[19] == 1 + assert snp_tab.snp_index[99] == 2 + # only 3 values of index should be non -1 + assert np.where(snp_tab.snp_index != -1)[0].shape[0] == 3 + + # check snp_allele set correctly + assert snp_tab.snp_allele1[0] == "A" + assert snp_tab.snp_allele2[0] == "C" + assert snp_tab.snp_allele1[1] == "T" + assert snp_tab.snp_allele2[1] == "G" + assert snp_tab.snp_allele1[2] == "A" + assert snp_tab.snp_allele2[2] == "T" + + # check that snp_pos set correctly + assert snp_tab.snp_pos[0] == 10 + assert snp_tab.snp_pos[1] == 20 + assert snp_tab.snp_pos[2] == 100 + + + def test_read_indels(self): + data = Data() + data.snp_list = [(10, "A", "-"), # 1bp deletion + (20, "A", "ATTG"), # 3bp insertion + (21, "A", "T"), # not an indel + (3, "AAA", "A")] # 2bp deletion + + data.setup() + + snp_tab = snptable.SNPTable() + snp_tab.read_file(data.snp_filename) + + # check snp_index set correctly + assert len(snp_tab.snp_index) == 21 + assert snp_tab.snp_index[9] == 0 + assert snp_tab.snp_index[19] == 1 + assert snp_tab.snp_index[2] == 3 + + # only 4 values of index should be non -1 + assert np.where(snp_tab.snp_index != -1)[0].shape[0] == 4 + + # check snp_allele set correctly + assert snp_tab.snp_allele1[0] == "A" + assert snp_tab.snp_allele2[0] == "" + assert snp_tab.snp_allele1[1] == "A" + assert snp_tab.snp_allele2[1] == "ATTG" + assert snp_tab.snp_allele1[3] == "AAA" + assert snp_tab.snp_allele2[3] == "A" + + # check that snp_pos set correctly + assert snp_tab.snp_pos[0] == 10 + assert snp_tab.snp_pos[1] == 20 + assert snp_tab.snp_pos[2] == 21 + assert snp_tab.snp_pos[3] == 3 + + +class TestGetOverlappingSNPs: + + def test_get_overlapping_snps_simple(self): + """Do a simple test of getting 2 overlapping SNPs + with a read with 30 matches""" + data = Data() + data.setup() + + # write a single read with all matches to SAM + sam_file = open(data.sam_filename, "w") + data.write_sam_header(sam_file) + data.write_sam_read(sam_file) + sam_file.close() + + sam_file = pysam.Samfile(data.sam_filename) + read = sam_file.next() + + # simple case where read has only one big match segment + snp_tab = snptable.SNPTable() + snp_tab.read_file(data.snp_filename) + snp_idx, snp_read_pos, \ + indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read) + + # check that overlapping SNPs are found and in correct locations + assert len(snp_idx) == 2 + assert snp_idx[0] == 0 + assert snp_idx[1] == 1 + + assert snp_read_pos[0] == 10 + assert snp_read_pos[1] == 20 + + assert len(indel_idx) == 0 + assert len(indel_read_pos) == 0 + + + + def test_get_overlapping_snps_intron(self): + """Test a read spanning an intron (N in CIGAR string)""" + data = Data() + data.setup() + + # write a single read with intron in CIGAR (N) + sam_file = open(data.sam_filename, "w") + data.write_sam_header(sam_file) + data.write_sam_read(sam_file, cigar="10M85N20M") + sam_file.close() + + 
sam_file = pysam.Samfile(data.sam_filename) + read = sam_file.next() + + snp_tab = snptable.SNPTable() + snp_tab.read_file(data.snp_filename) + snp_idx, snp_read_pos, \ + indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read) + + # check that overlapping SNPs are found and in correct locations + assert len(snp_idx) == 2 + assert snp_idx[0] == 0 + assert snp_idx[1] == 2 + + assert snp_read_pos[0] == 10 + assert snp_read_pos[1] == 15 + + + + def test_get_overlapping_snps_softclip(self): + """Test that soft-clipped part of read is not used""" + data = Data() + data.setup() + + # write a single read with softclipping on left end + sam_file = open(data.sam_filename, "w") + data.write_sam_header(sam_file) + data.write_sam_read(sam_file, cigar="10S20M") + sam_file.close() + + sam_file = pysam.Samfile(data.sam_filename) + read = sam_file.next() + + snp_tab = snptable.SNPTable() + snp_tab.read_file(data.snp_filename) + snp_idx, snp_read_pos, \ + indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read) + + # check that overlapping SNPs are found and in correct locations + assert len(snp_idx) == 2 + assert snp_idx[0] == 0 + assert snp_idx[1] == 1 + assert snp_read_pos[0] == 20 + assert snp_read_pos[1] == 30 + + + def test_get_overlapping_indel(self): + """Test that indels can be correctly obtained""" + data = Data() + data.snp_list = [(10, "A", "-")] + data.setup() + + # write a single read with match + sam_file = open(data.sam_filename, "w") + data.write_sam_header(sam_file) + data.write_sam_read(sam_file, cigar="30M") + sam_file.close() + + sam_file = pysam.Samfile(data.sam_filename) + read = sam_file.next() + + snp_tab = snptable.SNPTable() + snp_tab.read_file(data.snp_filename) + snp_idx, snp_read_pos, \ + indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read) + + # check that overlapping indel found in correct location + assert len(snp_idx) == 0 + assert len(indel_idx) == 1 + assert indel_idx[0] == 0 + assert indel_read_pos[0] == 10 + + + + + + diff --git a/mapping/util.py b/mapping/util.py new file mode 100644 index 0000000..0547894 --- /dev/null +++ b/mapping/util.py @@ -0,0 +1,62 @@ +import sys +import string +import subprocess +import os + + +DNA_COMP = None + +def comp(seq_str): + """complements the provided DNA sequence and returns it""" + global DNA_COMP + + if DNA_COMP is None: + DNA_COMP = string.maketrans("ATCGMRWSYKNatcgmrwsykn", + "TAGCKYWSRMNtagckywsrmn") + return seq_str.translate(DNA_COMP) + + +def revcomp(seq_str): + """returns reverse complement of provided DNA sequence""" + return comp(seq_str)[::-1] + + +def sort_bam(input_bam, output_prefix): + """Calls samtools sort on input_bam filename and writes to + output_bam. 
Takes into account that the command line arguments + for samtools sort have changed between versions.""" + + output_bam = output_prefix + ".sort.bam" + + # first try new way of using samtools sort + failed = False + cmd = "samtools sort -o " + output_bam + " " + input_bam + sys.stderr.write("running command: %s\n" % cmd) + try: + subprocess.check_call(cmd, shell=True) + except Exception as e: + sys.stderr.write("samtools sort command failed:\n%s\n" % + str(e)) + failed = True + if not os.path.exists(output_bam): + sys.stderr.write("output file %s does not exist\n" % output_bam) + failed = True + + if failed: + # OLD way of calling samtools (changed in newer versions) + sys.stderr.write("samtools sort command failed, trying old samtools " + "syntax\n") + + cmd = "samtools sort " + input_bam + " " + output_prefix + sys.stderr.write("running command: %s\n" % cmd) + + try: + subprocess.check_call(cmd, shell=True) + except Exception as e: + sys.stderr.write("samtools sort command failed:\n%s\n" % + str(e)) + exit(1) + + if not os.path.exists(paths.sorted_output_bam): + raise IOError("Failed to create sorted BAM file '%s'" % + paths.sorted_output_bam) diff --git a/snp2h5/Makefile b/snp2h5/Makefile index 1828595..c2b8489 100644 --- a/snp2h5/Makefile +++ b/snp2h5/Makefile @@ -6,9 +6,9 @@ LIB=-lz -lm -lhdf5 -lhdf5_hl INCLUDE=-I$(HDF_INSTALL)/include LIBSHDF=$(EXTLIB) -CFLAGS=-g -DH5_USE_16_API $(INCLUDE) +CFLAGS=-DH5_USE_16_API $(INCLUDE) -Wall -objects=vcf.o impute.o util.o memutil.o err.o chrom.o snptab.o seq.o nuc.o +objects=vcf.o impute.o util.o memutil.o err.o chrom.o snptab.o seq.o nuc.o sampletab.o default: all diff --git a/snp2h5/fasta2h5.c b/snp2h5/fasta2h5.c index 6849c22..1e6ee33 100644 --- a/snp2h5/fasta2h5.c +++ b/snp2h5/fasta2h5.c @@ -91,8 +91,7 @@ void usage(char **argv) { void parse_args(Arguments *args, int argc, char **argv) { - int c, i; - char *format_str = NULL; + int c; static struct option loptions[] = { {"chrom", required_argument, 0, 'c'}, @@ -243,15 +242,15 @@ void parse_fasta(Arguments *args, H5VectorInfo *seq_vec_info) { for(i = 0; i < args->n_input_files; i++) { chrom = chrom_guess_from_file(args->input_files[i], all_chroms, n_chrom); - - fprintf(stderr, "chromosome: %s, length: %ldbp\n", - chrom->name, chrom->len); - + if(chrom == NULL) { my_err("%s:%d: could not guess chromosome from filename " "%s\n", __FILE__, __LINE__, args->input_files[i]); } + fprintf(stderr, "chromosome: %s, length: %ldbp\n", + chrom->name, chrom->len); + /* seq sequence from fasta file */ seq_read_fasta_from_file(seq, args->input_files[i]); diff --git a/snp2h5/get_sample_names.py b/snp2h5/get_sample_names.py new file mode 100644 index 0000000..c9ce6c5 --- /dev/null +++ b/snp2h5/get_sample_names.py @@ -0,0 +1,34 @@ + +import tables +import sys + +import argparse + + +def main(): + parser = argparse.ArgumentParser(description="Writes names of samples " + "contained in HDF5 file to stdout") + + parser.add_argument("h5file", help="HDF5 file containing /samples table") + + options = parser.parse_args() + + h5f = tables.openFile(options.h5file) + + for node in h5f.root: + if node.name.startswith("samples"): + _, chr_name = node.name.split("_", 1) + + sys.stdout.write("%s:\n" % chr_name) + for row in node: + sys.stdout.write(" %s\n" % row['name']) + sys.stdout.write("\n") + else: + sys.stderr.write("%s does not contain samples table\n" % options.h5file) + exit(2) + + h5f.close() + + +if __name__ == "__main__": + main() diff --git a/snp2h5/impute.c b/snp2h5/impute.c index 74dc7dc..4f777ca 100644 --- 
a/snp2h5/impute.c +++ b/snp2h5/impute.c @@ -63,12 +63,12 @@ long impute_count_fields(gzFile fh) { } -void impute_parse_haplotypes(char *haplotypes, char *cur, long n_samples) { - long expect_n, i, val; +void impute_parse_haplotypes(char *haplotypes, char *cur, long n_sample) { + long expect_n, i; char *tok; char delim[] = " \t"; - expect_n = n_samples * 2; + expect_n = n_sample * 2; i = 0; while((tok = strsep(&cur, delim)) != NULL) { @@ -100,12 +100,12 @@ void impute_parse_haplotypes(char *haplotypes, char *cur, long n_samples) { -void impute_parse_geno_probs(float *geno_probs, char *cur, long n_samples) { - long expect_n, i, val; +void impute_parse_geno_probs(float *geno_probs, char *cur, long n_sample) { + long expect_n, i; char delim[] = " \t"; char *tok; - expect_n = n_samples * 3; + expect_n = n_sample * 3; i = 0; while((tok = strsep(&cur, delim)) != NULL) { @@ -136,11 +136,11 @@ void impute_parse_geno_probs(float *geno_probs, char *cur, long n_samples) { * * If geno_probs array is non-null genotype probabilities are parsed and * stored in the provided array. The array must be of length - * n_samples*3. + * n_sample*3. * * If haplotypes array is non-null phased genotypes are parsed and * stored in the provided array. The array must be of length - * n_samples*2. + * n_sample*2. * * IMPUTE files contain EITHER haplotypes OR genotypes so only * one of geno_probs or haplotypes should be non-null (at most). @@ -150,7 +150,7 @@ void impute_parse_geno_probs(float *geno_probs, char *cur, long n_samples) { int impute_read_line(gzFile fh, ImputeInfo *impute_info, SNP *snp, float *geno_probs, char *haplotypes) { char *cur, *token; - int n_fix_header, ref_len, alt_len; + int alt_len; size_t tok_num; const char delim[] = " \t"; @@ -202,10 +202,11 @@ int impute_read_line(gzFile fh, ImputeInfo *impute_info, SNP *snp, "both requested\n"); } else if(geno_probs) { - impute_parse_geno_probs(geno_probs, cur, impute_info->n_samples); + impute_parse_geno_probs(geno_probs, cur, impute_info->n_sample); } else if(haplotypes) { - impute_parse_haplotypes(haplotypes, cur, impute_info->n_samples); + impute_parse_haplotypes(haplotypes, cur, impute_info->n_sample); } + return 0; } diff --git a/snp2h5/impute.h b/snp2h5/impute.h index 5faab79..8b55f03 100644 --- a/snp2h5/impute.h +++ b/snp2h5/impute.h @@ -11,7 +11,7 @@ typedef struct { - int n_samples; + int n_sample; /* used for reading lines */ size_t buf_size; diff --git a/snp2h5/sample.h b/snp2h5/sample.h new file mode 100644 index 0000000..990fbb9 --- /dev/null +++ b/snp2h5/sample.h @@ -0,0 +1,13 @@ +#ifndef __SAMPLE_H__ +#define __SAMPLE_H__ + + +/* maximum length of sample name */ +#define SAMPLE_MAX_NAME 100 + +typedef struct { + char name[SAMPLE_MAX_NAME]; +} Sample; + + +#endif diff --git a/snp2h5/sampletab.c b/snp2h5/sampletab.c new file mode 100644 index 0000000..4f41f2a --- /dev/null +++ b/snp2h5/sampletab.c @@ -0,0 +1,149 @@ +#include +#include + +#include +#include + + +#include "memutil.h" +#include "sampletab.h" +#include "err.h" +#include "util.h" + + +/** + * Allocates memory for and initializes data structure + * that describes the HDF5 table used to store Samples data. 
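+ * The table is written to the provided HDF5 file as "samples_<chrom_name>";
+ * the returned SampleTab should be released with sample_tab_free().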
+ * + */ +SampleTab *sample_tab_new(hid_t h5file, const char *chrom_name, + size_t n_record) { + herr_t status; + SampleTab *tab; + Sample sample_desc; + + const char *field_names[] = {"name"}; + + tab = my_malloc(sizeof(SampleTab)); + + tab->h5file = h5file; + + /* set datatypes for each field */ + tab->name_type = H5Tcopy(H5T_C_S1); + H5Tset_size(tab->name_type, SAMPLE_MAX_NAME); + tab->field_type[0] = tab->name_type; /* name */ + + /* sizes of record and each field */ + tab->record_size = sizeof(Sample); + tab->field_size[0] = sizeof(sample_desc.name); + + /* offsets of each field */ + tab->field_offset[0] = HOFFSET(Sample, name); + + /* title and name of table */ + tab->title = util_str_concat(chrom_name, " samples", NULL); + tab->name = util_str_concat("samples_", chrom_name, NULL); + + /* set chunk size and compression */ + tab->chunk_size = SAMPLETAB_CHUNK_SIZE; + tab->compress = 1; + + tab->n_record = 0; + + status = H5TBmake_table(tab->title, tab->h5file, tab->name, + SAMPLETAB_N_FIELDS, n_record, + tab->record_size, field_names, + tab->field_offset, tab->field_type, + tab->chunk_size, NULL, tab->compress, NULL); + + if(status < 0) { + my_err("%s:%d: could not create samples table " + "for chromosome %s\n", chrom_name, + __FILE__, __LINE__); + } + + return tab; +} + + +void sample_tab_free(SampleTab *tab) { + H5Tclose(tab->name_type); + my_free(tab->title); + my_free(tab->name); + + my_free(tab); +} + + +/** + * Creates a new samples table HDF5 file pointed to by h5file handle + * and populates it using provided array of samples + */ +SampleTab *sample_tab_create(hid_t h5file, const char *chrom_name, + Sample *samples, size_t n_sample) { + SampleTab *tab; + int i; + + tab = sample_tab_new(h5file, chrom_name, n_sample); + + for(i = 0; i < n_sample; i++) { + sample_tab_append_row(tab, &samples[i]); + } + + return tab; +} + + + +SampleTab *sample_tab_from_names(hid_t h5file, const char *chrom_name, + char **sample_names, size_t n_sample) { + Sample *samples; + SampleTab *samp_tab; + int i; + + samples = my_malloc(sizeof(Sample) * n_sample); + + for(i = 0; i < n_sample; i++) { + util_strncpy(samples[i].name, sample_names[i], + sizeof(samples[i].name)); + } + + samp_tab = sample_tab_create(h5file, chrom_name, samples, n_sample); + + my_free(samples); + + return samp_tab; +} + + + +/** + * Appends row to table described by provided SampleTab datastructure. 
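+ * Increments the table's n_record count after the record is written.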
+ */ +void sample_tab_append_row(SampleTab *tab, Sample *data) { + herr_t status; + hsize_t n_to_write; + + n_to_write = 1; + + /* fprintf(stderr, "writing sample record:\n" */ + /* "data: %s\n" */ + /* "n_record: %d\n" */ + /* "record_size: %d\n" */ + /* "field_offset[0]: %d\n" */ + /* "field_size[0]: %d\n", data->name, tab->n_record, tab->record_size, */ + /* tab->field_offset[0], tab->field_size[0]); */ + + status = H5TBwrite_records(tab->h5file, tab->name, + tab->n_record, n_to_write, + tab->record_size, tab->field_offset, + tab->field_size, data); + + tab->n_record += 1; + + if(status < 0) { + my_err("%s:%d: failed to write record to Sample table\n", + __FILE__, __LINE__); + } +} + diff --git a/snp2h5/sampletab.h b/snp2h5/sampletab.h new file mode 100644 index 0000000..4ab9af4 --- /dev/null +++ b/snp2h5/sampletab.h @@ -0,0 +1,59 @@ +#ifndef __SAMPLETAB_H__ +#define __SAMPLETAB_H__ + +#include + +#include "sample.h" + + +#define SAMPLETAB_N_FIELDS 1 + +/* chunk size affects performance a lot + * small chunks = much faster writing of tables, but + * worse compression + */ + +#define SAMPLETAB_CHUNK_SIZE 100 + + +/* SampleTab holds information about Samples + * and datatypes of each field. Currently + * there is only a single field which is the + * name of the sample, however you could imagine + * adding other information such as sex, + * age, population, etc. + */ + +typedef struct { + hid_t h5file; + size_t record_size; + char *name; + char *title; + hid_t field_type[SAMPLETAB_N_FIELDS]; + size_t field_size[SAMPLETAB_N_FIELDS]; + size_t field_offset[SAMPLETAB_N_FIELDS]; + + hid_t name_type; + + int compress; + size_t chunk_size; + + size_t n_record; +} SampleTab; + + +SampleTab *sample_tab_new(hid_t h5file, const char *chrom_name, + size_t n_record); + +SampleTab *sample_tab_create(hid_t h5file, const char *chrom_name, + Sample *samples, size_t n_sample); + +SampleTab *sample_tab_from_names(hid_t h5file, const char *chrom_name, + char **sample_names, + size_t n_sample); + +void sample_tab_free(SampleTab *tab); + +void sample_tab_append_row(SampleTab *tab, Sample *data); + +#endif diff --git a/snp2h5/snp2h5.c b/snp2h5/snp2h5.c index b4b6ac9..ab1f012 100644 --- a/snp2h5/snp2h5.c +++ b/snp2h5/snp2h5.c @@ -8,6 +8,8 @@ #include "impute.h" #include "vcf.h" +#include "sample.h" +#include "sampletab.h" #include "snptab.h" #include "util.h" #include "memutil.h" @@ -42,11 +44,13 @@ typedef struct { /* flag indicating format of input files (FORMAT_VCF or FORMAT_IMPUTE) */ int format; - /* HDF5 file that genotype probabilities are written to */ - char *geno_prob_file; - char *haplotype_file; - char *snp_index_file; - char *snp_tab_file; + /* HDF5 files that SNP info written to */ + char *geno_prob_file; /* genotype probabilities */ + char *haplotype_file; /* haplotypes & phase */ + char *snp_index_file; /* base position => SNP table row lookup */ + char *snp_tab_file; /* SNP table with id, alleles, etc */ + + char *sample_file; char **input_files; int n_input_files; @@ -88,9 +92,9 @@ typedef struct { typedef struct { - long n_lines; /* number of lines in file (including header lines) */ + long n_line; /* number of lines in file (including header lines) */ long n_row; /* number of data rows in file */ - long n_samples; /* number of samples in file */ + long n_sample; /* number of samples in file */ long n_geno_prob_col; /* number of genotype prob columns */ long n_haplotype_col; /* number of haplotype columns */ @@ -130,11 +134,18 @@ void usage(char **argv) { " Specifies the format of the input 
files. Currently supported\n" " formats are 'impute' or 'vcf'\n" "\n" + " --samples SAMPLES_FILE\n" + " Input file containing ordered names of samples, one sample\n" + " per line. This is only required for impute-formatted input\n" + " files. Sample names for VCF input files are read from \n" + " header lines in the VCF files.\n" + "\n" "Output Options:\n" " --geno_prob GENO_PROB_OUTPUT_FILE [optional]\n" " Path to HDF5 file to write genotype probabilities to.\n" " This option can only be used for impute2 files or VCF files\n" - " that provide genotype likelihoods, (GL in the FORMAT specifier).\n" + " that provide genotype likelihoods or posterior probabilities\n" + " (GL or GP in the VCF FORMAT specifier).\n" "\n" " --haplotype HAPLOTYPE_OUTPUT_FILE [optional]\n" " Path to HDF5 file to write haplotypes to. This option can only\n" @@ -178,6 +189,7 @@ void usage(char **argv) { " --snp_index snp_index.h5 \n" " --snp_tab snp_tab.h5 \n" " --haplotype haps.h5 \n" + " --samples samples_names.txt\n" " genotypes/chr*.hg19.impute2.gz \n" " genotypes/chr*.hg19.impute2_haps.gz\n" "\n" @@ -186,7 +198,7 @@ void usage(char **argv) { void parse_args(Arguments *args, int argc, char **argv) { - int c, i; + int c; char *format_str = NULL; static struct option loptions[] = { @@ -196,6 +208,7 @@ void parse_args(Arguments *args, int argc, char **argv) { {"haplotype", required_argument, 0, 'h'}, {"snp_index", required_argument, 0, 'i'}, {"snp_tab", required_argument, 0, 't'}, + {"samples", required_argument, 0, 's'}, {0,0,0,0} }; args->chrom_file = NULL; @@ -204,9 +217,10 @@ void parse_args(Arguments *args, int argc, char **argv) { args->haplotype_file = NULL; args->snp_index_file = NULL; args->snp_tab_file = NULL; + args->sample_file = NULL; while(1) { - c = getopt_long(argc, argv, "c:f:p:h:i:t:", loptions, NULL); + c = getopt_long(argc, argv, "c:f:p:h:i:t:s:", loptions, NULL); if(c == -1) { break; @@ -219,6 +233,7 @@ void parse_args(Arguments *args, int argc, char **argv) { case 'h': args->haplotype_file = util_str_dup(optarg); break; case 'i': args->snp_index_file = util_str_dup(optarg); break; case 't': args->snp_tab_file = util_str_dup(optarg); break; + case 's': args->sample_file = util_str_dup(optarg); break; default: usage(argv); break; } } @@ -267,10 +282,62 @@ void parse_args(Arguments *args, int argc, char **argv) { exit(-1); } } + + if(args->sample_file && (args->format == FORMAT_VCF)) { + my_warn("ignoring sample names from --samples input file " + "because using sample information from VCF headers instead\n"); + } + } +Sample *read_sample_info(Arguments *args, int *n_sample) { + gzFile gzf; + Sample *samples; + char *line; + int i; + + *n_sample = 0; + samples = NULL; + + if(args->sample_file) { + /* read sample information from samples file */ + *n_sample = util_count_lines(args->sample_file); + + samples = my_malloc(sizeof(Sample) * *n_sample); + + gzf = util_must_gzopen(args->sample_file, "rb"); + i = 0; + while((line = util_gzgets_line(gzf)) != NULL) { + if(i >= *n_sample) { + my_err("%s:%d: more sample lines than expected in file %s\n", + __FILE__, __LINE__, args->sample_file); + } + util_str_strip(line); + util_strncpy(samples[i].name, line, sizeof(samples[i].name)); + my_free(line); + i += 1; + } + if(i != *n_sample) { + my_err("%s:%d: expected %d lines in file, but got %d\n", + __FILE__, __LINE__, *n_sample, i); + } + } + + if(args->format == FORMAT_VCF) { + /* Use sample info from VCF headers, rather than this input + * file. 
Do this because the number of samples can differ across + * VCF files (e.g. chrY VCF from 1000 genomes only has male + * samples) + */ + my_warn("ignoring sample names from --samples input file " + "using sample information from VCF headers instead\n"); + } + + return samples; +} + hid_t create_h5file(const char *filename) { @@ -478,21 +545,21 @@ void set_file_info(gzFile gzf, char *filename, Arguments *args, FileInfo *file_i /* count total number lines in file, this tells us number of records */ fprintf(stderr, "counting lines in file\n"); - file_info->n_lines = util_count_lines(filename); - fprintf(stderr, " total lines: %ld\n", file_info->n_lines); + file_info->n_line = util_count_lines(filename); + fprintf(stderr, " total lines: %ld\n", file_info->n_line); if(args->format == FORMAT_VCF) { /* parse VCF headers */ fprintf(stderr, "reading VCF header\n"); vcf_read_header(gzf, vcf); - fprintf(stderr, " VCF header lines: %ld\n", vcf->n_header_lines); + fprintf(stderr, " VCF header lines: %ld\n", vcf->n_header_line); - file_info->n_row = file_info->n_lines - vcf->n_header_lines; - file_info->n_samples = vcf->n_samples; + file_info->n_row = file_info->n_line - vcf->n_header_line; + file_info->n_sample = vcf->n_sample; } else if(args->format == FORMAT_IMPUTE) { /* get number of samples from first row of IMPUTE file */ - file_info->n_row = file_info->n_lines; + file_info->n_row = file_info->n_line; n_col = impute_count_fields(gzf) - IMPUTE_FIX_HEADER; /* here we assume that the file is an IMPUTE file @@ -506,21 +573,21 @@ void set_file_info(gzFile gzf, char *filename, Arguments *args, FileInfo *file_i "multiple of 3", __FILE__, __LINE__, n_col); } - file_info->n_samples = n_col / 3; + file_info->n_sample = n_col / 3; file_info->n_geno_prob_col = n_col; - /* if we used haplotype file, n_samples would be n_col/2 */ - /* file_info->n_samples = n_col / 2; */ + /* if we used haplotype file, n_sample would be n_col/2 */ + /* file_info->n_sample = n_col / 2; */ - impute_info->n_samples = file_info->n_samples; + impute_info->n_sample = file_info->n_sample; } else { my_err("%s:%d: unknown file format\n", __FILE__, __LINE__); } - fprintf(stderr, " number of samples: %ld\n", file_info->n_samples); - file_info->n_geno_prob_col = file_info->n_samples * 3; - file_info->n_haplotype_col = file_info->n_samples * 2; + fprintf(stderr, " number of samples: %ld\n", file_info->n_sample); + file_info->n_geno_prob_col = file_info->n_sample * 3; + file_info->n_haplotype_col = file_info->n_sample * 2; } @@ -630,8 +697,18 @@ void parse_impute(Arguments *args, Chromosome *all_chroms, int n_chrom, char **hap_files; int n_imp_files; int n_hap_files; + int n_sample; long missing_geno_probs; long n_haplotype_row; + SampleTab *samp_tab; + Sample *samples; + + if(args->sample_file) { + samples = read_sample_info(args, &n_sample); + } else { + samples = NULL; + n_sample = 0; + } impute_info = impute_info_new(); @@ -670,6 +747,21 @@ void parse_impute(Arguments *args, Chromosome *all_chroms, int n_chrom, file_info.n_geno_prob_col, GENO_PROB_DATATYPE, chrom->name); + if(samples) { + if((n_sample*3) != file_info.n_geno_prob_col) { + my_warn("%s:%d number of samples*3 (%d*3=%d) does not match " + "number of genotype columns for chromosome %s " + "(%d)\n", + __FILE__, __LINE__, n_sample, n_sample*3, chrom->name, + file_info.n_geno_prob_col); + } + + /* write genotype prob sample names table for this chromosome */ + samp_tab = sample_tab_create(gprob_info->h5file, chrom->name, + samples, n_sample); + sample_tab_free(samp_tab); + } + /* 
fill H5Matrix with default values for genotype probabilities */ for(j = 0; j < file_info.n_geno_prob_col; j++) { geno_probs[j] = GENO_PROB_DEFAULT_VAL; @@ -692,12 +784,12 @@ void parse_impute(Arguments *args, Chromosome *all_chroms, int n_chrom, if(args->snp_index_file) { init_h5vector(snp_index_info, chrom->len, - SNP_INDEX_DATATYPE, chrom->name); + SNP_INDEX_DATATYPE, chrom->name); } if(args->snp_tab_file) { snp_tab = snp_tab_new(snp_tab_h5file, chrom->name, - file_info.n_row); + file_info.n_row); } else { snp_tab = NULL; } @@ -762,7 +854,21 @@ void parse_impute(Arguments *args, Chromosome *all_chroms, int n_chrom, file_info.n_haplotype_col, HAPLOTYPE_DATATYPE, chrom->name); + if(samples) { + if((n_sample*2) != file_info.n_haplotype_col) { + my_warn("%s:%d number of samples*2 (%d*2=%d) does not match " + "number of haplotype columns for chromosome %s " + "(%d)\n", __FILE__, __LINE__, n_sample, + n_sample*2, chrom->name, file_info.n_haplotype_col); + } + + /* write haplotype sample names table for this chromosome */ + samp_tab = sample_tab_create(haplotype_info->h5file, chrom->name, + samples, n_sample); + sample_tab_free(samp_tab); + } + /* fill H5Matrix with default values for haplotypes */ for(j = 0; j < file_info.n_haplotype_col; j++) { haplotypes[j] = HAPLOTYPE_DEFAULT_VAL; @@ -868,6 +974,9 @@ void parse_impute(Arguments *args, Chromosome *all_chroms, int n_chrom, + + + void parse_vcf(Arguments *args, Chromosome *all_chroms, int n_chrom, H5MatrixInfo *gprob_info, H5MatrixInfo *haplotype_info, H5VectorInfo *snp_index_info, hid_t snp_tab_h5file) { @@ -882,6 +991,7 @@ void parse_vcf(Arguments *args, Chromosome *all_chroms, int n_chrom, hsize_t row; gzFile gzf; Chromosome *chrom; + SampleTab *samp_tab; vcf = vcf_info_new(); @@ -912,6 +1022,18 @@ void parse_vcf(Arguments *args, Chromosome *all_chroms, int n_chrom, init_h5matrix(gprob_info, file_info.n_row, file_info.n_geno_prob_col, GENO_PROB_DATATYPE, chrom->name); + + if((vcf->n_sample*3) != file_info.n_geno_prob_col) { + my_warn("%s:%d number of samples*3 (%d*3=%d) does not match " + "number of genotype columns for chromosome %s " + "(%d)\n", __FILE__, __LINE__, vcf->n_sample, + vcf->n_sample*3, chrom->name, file_info.n_geno_prob_col); + } + + /* create table of sample names for genotype probs */ + samp_tab = sample_tab_from_names(gprob_info->h5file, chrom->name, + vcf->sample_names, vcf->n_sample); + sample_tab_free(samp_tab); } else { geno_probs = NULL; } @@ -920,6 +1042,18 @@ void parse_vcf(Arguments *args, Chromosome *all_chroms, int n_chrom, init_h5matrix(haplotype_info, file_info.n_row, file_info.n_haplotype_col, HAPLOTYPE_DATATYPE, chrom->name); + + if((vcf->n_sample*2) != file_info.n_haplotype_col) { + my_warn("%s:%d number of samples (%d*2=%d) does not match " + "number of haplotype columns for chromosome %s " + "(%d)\n", __FILE__, __LINE__, vcf->n_sample, + vcf->n_sample*2, chrom->name, file_info.n_haplotype_col); + } + + /* create table of sample names for haplotypes */ + samp_tab = sample_tab_from_names(haplotype_info->h5file, chrom->name, + vcf->sample_names, vcf->n_sample); + sample_tab_free(samp_tab); } else { haplotypes = NULL; } @@ -942,7 +1076,7 @@ void parse_vcf(Arguments *args, Chromosome *all_chroms, int n_chrom, } else { snp_tab = NULL; } - + row = 0; fprintf(stderr, "parsing file and writing to HDF5 files\n"); @@ -1017,8 +1151,9 @@ void parse_vcf(Arguments *args, Chromosome *all_chroms, int n_chrom, + + int main(int argc, char **argv) { - FileInfo file_info; Arguments args; Chromosome *all_chroms; int n_chrom; @@ 
-1032,7 +1167,7 @@ int main(int argc, char **argv) { all_chroms = chrom_read_file(args.chrom_file, &n_chrom); fprintf(stderr, "long alleles will be truncated to %dbp\n", SNP_MAX_ALLELE); - + /* create new HDF5 file(s) */ if(args.geno_prob_file) { gprob_info.h5file = create_h5file(args.geno_prob_file); @@ -1070,6 +1205,8 @@ int main(int argc, char **argv) { chrom_array_free(all_chroms, n_chrom); + /* TODO: fix small mem leak: sample names never freed */ + /* close HDF5 files */ if(args.geno_prob_file) { H5Fclose(gprob_info.h5file); diff --git a/snp2h5/snptab.c b/snp2h5/snptab.c index 7951ba3..2a923b4 100644 --- a/snp2h5/snptab.c +++ b/snp2h5/snptab.c @@ -37,6 +37,7 @@ SNPTab *snp_tab_new(hid_t h5file, const char *chrom_name, H5Tset_size(tab->name_type, SNP_MAX_NAME); /* no longer store chromosome as each chromosome * gets its own table + */ /* tab->chrom_type = H5Tcopy(H5T_C_S1); * H5Tset_size(tab->chrom_type, SNP_MAX_CHROM); */ @@ -90,8 +91,6 @@ SNPTab *snp_tab_new(hid_t h5file, const char *chrom_name, void snp_tab_free(SNPTab *tab) { - int i; - H5Tclose(tab->allele_type); H5Tclose(tab->name_type); my_free(tab->title); diff --git a/snp2h5/util.c b/snp2h5/util.c index d763a66..04c1dde 100644 --- a/snp2h5/util.c +++ b/snp2h5/util.c @@ -580,6 +580,7 @@ size_t util_strncpy(char *dest, const char *src, size_t n) { if(n > 0) { while((i < n-1) && (src[i] != '\0')) { dest[i] = src[i]; + i++; } /* padd to end with \0 */ diff --git a/snp2h5/vcf.c b/snp2h5/vcf.c index 22b9fa0..2ca003a 100644 --- a/snp2h5/vcf.c +++ b/snp2h5/vcf.c @@ -31,6 +31,9 @@ VCFInfo *vcf_info_new() { vcf_info->buf_size = 1024; vcf_info->buf = my_malloc(vcf_info->buf_size); + vcf_info->n_sample = 0; + vcf_info->sample_names = NULL; + return vcf_info; } @@ -39,6 +42,16 @@ VCFInfo *vcf_info_new() { * free memory allocated for reading lines */ void vcf_info_free(VCFInfo *vcf_info) { + int i; + + if(vcf_info->sample_names) { + for(i = 0; i < vcf_info->n_sample; i++) { + my_free(vcf_info->sample_names[i]); + } + my_free(vcf_info->sample_names); + } + + my_free(vcf_info->buf); my_free(vcf_info); } @@ -47,35 +60,28 @@ void vcf_info_free(VCFInfo *vcf_info) { void vcf_read_header(gzFile vcf_fh, VCFInfo *vcf_info) { char *line, *cur, *token; int tok_num; - int n_fix_header; + int n_fix_header, i; /* const char delim[] = " \t"; */ const char delim[] = "\t"; n_fix_header = sizeof(vcf_fix_headers) / sizeof(const char *); - vcf_info->n_header_lines = 0; + vcf_info->n_header_line = 0; while(util_gzgetline(vcf_fh, &vcf_info->buf, &vcf_info->buf_size) != -1) { - - /* - line = util_gzgets_line(vcf_fh); - if(line == NULL) { - my_err("%s:%d: could not read header information from file", - __FILE__, __LINE__); - } - */ line = vcf_info->buf; if(util_str_starts_with(line, "##")) { /* header line */ - vcf_info->n_header_lines += 1; + vcf_info->n_header_line += 1; } else if(util_str_starts_with(line, "#CHROM")) { /* this should be last header line that contains list of fixed fields */ - vcf_info->n_header_lines += 1; + vcf_info->n_header_line += 1; cur = vcf_info->buf; + line = util_str_dup(vcf_info->buf); tok_num = 0; while((token = strsep(&cur, delim)) != NULL) { if(tok_num < n_fix_header) { @@ -86,13 +92,28 @@ void vcf_read_header(gzFile vcf_fh, VCFInfo *vcf_info) { } tok_num += 1; } - vcf_info->n_samples = tok_num - n_fix_header; - /* my_free(line); */ + vcf_info->n_sample = tok_num - n_fix_header; + + /* + * read sample names from remaining part of header + */ + vcf_info->sample_names = my_malloc(sizeof(char *) * vcf_info->n_sample); + cur = line; + 
tok_num = 0; + i = 0; + while((token = strsep(&cur, delim)) != NULL) { + if(tok_num >= n_fix_header) { + vcf_info->sample_names[i] = util_str_dup(token); + i += 1; + } + tok_num += 1; + } + my_free(line); + break; } else { my_err("expected last line in header to start with #CHROM"); } - /* my_free(line); */ } } @@ -134,6 +155,7 @@ void vcf_parse_haplotypes(VCFInfo *vcf_info, char *haplotypes, char *cur) { int gt_idx, hap1, hap2, i, n; static int warn_phase = TRUE; + static int warn_parse = TRUE; long expect_haps, n_haps; char gt_str[VCF_MAX_FORMAT]; @@ -152,7 +174,7 @@ void vcf_parse_haplotypes(VCFInfo *vcf_info, char *haplotypes, __FILE__, __LINE__, vcf_info->format); } - expect_haps = vcf_info->n_samples * 2; + expect_haps = vcf_info->n_sample * 2; n_haps = 0; @@ -172,14 +194,20 @@ void vcf_parse_haplotypes(VCFInfo *vcf_info, char *haplotypes, /* try with '/' separator instead */ n = sscanf(inner_tok, "%d/%d", &hap1, &hap2); - if(n == 2 && warn_phase) { - my_warn("%s:%d: some genotypes are unphased (delimited " - "with '/' instead of '|')\n", __FILE__, __LINE__, - inner_tok); - warn_phase = FALSE; + if(n == 2) { + if(warn_phase) { + my_warn("%s:%d: some genotypes are unphased (delimited " + "with '/' instead of '|')\n", __FILE__, __LINE__, + inner_tok); + warn_phase = FALSE; + } } else { - my_warn("%s:%d: could not parse genotype string '%s'\n", - __FILE__, __LINE__, inner_tok); + if(warn_parse) { + my_warn("%s:%d: could not parse some genotype " + "strings that look like: '%s'\n", __FILE__, __LINE__, + inner_tok); + warn_parse = FALSE; + } hap1 = VCF_GTYPE_MISSING; hap2 = VCF_GTYPE_MISSING; } @@ -219,29 +247,21 @@ void vcf_parse_haplotypes(VCFInfo *vcf_info, char *haplotypes, -void vcf_parse_geno_probs(VCFInfo *vcf_info, float *geno_probs, - char *cur) { - /* char delim[] = " \t"; */ +/** + * get genotype probabilities by parsing and converting genotype likelihoods + * (GL) from VCF line + */ +void vcf_parse_gl(VCFInfo *vcf_info, float *geno_probs, char *cur, long gl_idx) { char delim[] = "\t"; char inner_delim[] = ":"; char *tok, *inner_tok, *inner_cur; char gtype[VCF_MAX_FORMAT]; - long gl_idx, i, n, n_geno_probs, expect_geno_probs; + long i, n, n_geno_probs, expect_geno_probs; float like_homo_ref, like_het, like_homo_alt; float prob_homo_ref, prob_het, prob_homo_alt, prob_sum; - expect_geno_probs = vcf_info->n_samples * 3; + expect_geno_probs = vcf_info->n_sample * 3; - /* get index of GL token in format string*/ - gl_idx = get_format_index(vcf_info->format, "GL"); - if(gl_idx == -1) { - my_err("%s:%d: VCF format string does not specify GL token so cannot " - "obtain genotype probabilities. 
Format string: '%s'.\n" - "To use this file, you must run snp2h5 without " - "the --geno_prob option.", __FILE__, __LINE__, - vcf_info->format); - } - n_geno_probs = 0; while((tok = strsep(&cur, delim)) != NULL) { @@ -304,7 +324,95 @@ void vcf_parse_geno_probs(VCFInfo *vcf_info, float *geno_probs, my_err("%s:%d: expected %ld genotype likelihoods per line, but got " "%ld", __FILE__, __LINE__, expect_geno_probs, n_geno_probs); } +} + + +/** + * get genotype probabilities by parsing GP token from VCF line + */ +void vcf_parse_gp(VCFInfo *vcf_info, float *geno_probs, char *cur, long gp_idx) { + char delim[] = "\t"; + char inner_delim[] = ":"; + char *tok, *inner_tok, *inner_cur; + char gtype[VCF_MAX_FORMAT]; + long i, n, n_geno_probs, expect_geno_probs; + float prob_homo_ref, prob_het, prob_homo_alt, prob_sum; + + expect_geno_probs = vcf_info->n_sample * 3; + + n_geno_probs = 0; + while((tok = strsep(&cur, delim)) != NULL) { + /* each genotype string is delimited by ':' + * each GP portion is delimited by ',' + */ + util_strncpy(gtype, tok, sizeof(gtype)); + + i = 0; + inner_cur = gtype; + while((i <= gp_idx) && (inner_tok = strsep(&inner_cur, inner_delim)) != NULL) { + if(i == gp_idx) { + n = sscanf(inner_tok, "%g,%g,%g", &prob_homo_ref, &prob_het, + &prob_homo_alt); + + if(n != 3) { + if(strcmp(inner_tok, ".") == 0) { + /* '.' indicates missing data + * set all probabilities to 0.333 + */ + prob_homo_ref = prob_het = prob_homo_alt = 0.333; + } else { + my_err("%s:%d: failed to parse genotype probabilities from " + "string '%s'", __FILE__, __LINE__, inner_tok); + } + } + + /* check that probs sum to 1.0, normalize if they don't */ + prob_sum = prob_homo_ref + prob_het + prob_homo_alt; + if((prob_sum > 1.001) || (prob_sum < 0.999)) { + prob_homo_ref = prob_homo_ref / prob_sum; + prob_het = prob_het / prob_sum; + prob_homo_alt = prob_homo_alt / prob_sum; + } + geno_probs[n_geno_probs] = prob_homo_ref; + geno_probs[n_geno_probs + 1] = prob_het; + geno_probs[n_geno_probs + 2] = prob_homo_alt; + + n_geno_probs += 3; + } + + i++; + } + } + + if(n_geno_probs != expect_geno_probs) { + my_err("%s:%d: expected %ld genotype probabilities per line, but got " + "%ld", __FILE__, __LINE__, expect_geno_probs, n_geno_probs); + } +} + + +void vcf_parse_geno_probs(VCFInfo *vcf_info, float *geno_probs, char *cur) { + long gl_idx, gp_idx; + + /* get index of GP and GL tokens in format string */ + gp_idx = get_format_index(vcf_info->format, "GP"); + gl_idx = get_format_index(vcf_info->format, "GL"); + + if((gl_idx == -1) && (gp_idx == -1)) { + my_err("%s:%d: VCF format string does not specify GL or GP token " + "so cannot obtain genotype probabilities. Format string: '%s'.\n" + "To use this file, you must run snp2h5 without " + "the --geno_prob option.", __FILE__, __LINE__, + vcf_info->format); + } + + if(gp_idx > -1) { + vcf_parse_gp(vcf_info, geno_probs, cur, gp_idx); + return; + } + + vcf_parse_gl(vcf_info, geno_probs, cur, gl_idx); } @@ -314,11 +422,11 @@ void vcf_parse_geno_probs(VCFInfo *vcf_info, float *geno_probs, * * If geno_probs array is non-null genotype likelihoods are parsed and * stored in the provided array. The array must be of length - * n_samples*3. + * n_sample*3. * * If haplotypes array is non-null phased genotypes are parsed and * stored in the provided array. The array must be of length - * n_samples*2. + * n_sample*2. * * Returns 0 on success, -1 if at EOF. 
*/ @@ -328,9 +436,10 @@ int vcf_read_line(gzFile vcf_fh, VCFInfo *vcf_info, SNP *snp, int n_fix_header, ref_len, alt_len; size_t tok_num; - /* Used to allow space or tab delimiters here but now only allow tab. - * This is because VCF specification indicates that fields should be tab-delimited, - * and occasionally some fields contain spaces. + /* Used to allow space or tab delimiters here but now only allow + * tab. This is because VCF specification indicates that fields + * should be tab-delimited, and occasionally some fields contain + * spaces. */ /* const char delim[] = " \t";*/ const char delim[] = "\t"; @@ -456,4 +565,6 @@ int vcf_read_line(gzFile vcf_fh, VCFInfo *vcf_info, SNP *snp, } /* my_free(line); */ + + return 0; } diff --git a/snp2h5/vcf.h b/snp2h5/vcf.h index 4522211..f101992 100644 --- a/snp2h5/vcf.h +++ b/snp2h5/vcf.h @@ -11,8 +11,8 @@ #define VCF_MAX_FORMAT 1024 typedef struct { - int n_samples; - long n_header_lines; + int n_sample; + long n_header_line; /* records true length of ref / alt alleles, which can be * truncated by limited buffer size of SNP datastructure @@ -25,6 +25,7 @@ typedef struct { char info[VCF_MAX_FILTER]; char format[VCF_MAX_FORMAT]; + char **sample_names; /* used for reading lines */ size_t buf_size;
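Note: with this patch, each HDF5 output file that stores per-sample data (geno_prob, haplotype) also gets one table of sample names per chromosome, written by sample_tab_create / sample_tab_from_names. The C sketch below shows one way such a table could be read back with the HDF5 high-level table API. It is illustration only: the table name ("samples_chr22") and the length of the name field are assumptions, since the actual names and sizes are defined in sample.h and sampletab.c rather than in this patch.

    /* hypothetical reader for a per-chromosome sample-name table;
     * compile with: gcc read_samples.c -lhdf5 -lhdf5_hl */
    #include <stdio.h>
    #include <stdlib.h>
    #include <hdf5.h>
    #include <hdf5_hl.h>

    #define NAME_LEN 64   /* assumed maximum sample-name length */

    typedef struct { char name[NAME_LEN]; } SampleRec;

    int main(void) {
      hid_t h5file;
      hsize_t n_field, n_record, i;
      size_t offset[1] = { 0 };        /* single 'name' field */
      size_t size[1] = { NAME_LEN };
      SampleRec *recs;

      h5file = H5Fopen("haps.h5", H5F_ACC_RDONLY, H5P_DEFAULT);
      if(h5file < 0) { fprintf(stderr, "cannot open haps.h5\n"); return 1; }

      /* "samples_chr22" is a guess at the per-chromosome table name */
      if(H5TBget_table_info(h5file, "samples_chr22", &n_field, &n_record) < 0) {
        fprintf(stderr, "cannot find sample table\n"); return 1;
      }

      recs = malloc(n_record * sizeof(SampleRec));
      H5TBread_records(h5file, "samples_chr22", 0, n_record,
                       sizeof(SampleRec), offset, size, recs);

      for(i = 0; i < n_record; i++) {
        printf("%llu\t%s\n", (unsigned long long)i, recs[i].name);
      }

      free(recs);
      H5Fclose(h5file);
      return 0;
    }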
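Note: the one-character util.c change above (the added i++) fixes a copy loop in util_strncpy that previously never advanced its index, so the bounded copy could spin on the first character. The standalone sketch below illustrates the corrected behavior; the real function's return value and padding code are only partly visible in the hunk, so those details are assumptions here.

    /* minimal sketch of a bounded, always-NUL-terminated copy,
     * mirroring the fixed loop in util_strncpy */
    #include <stdio.h>
    #include <string.h>

    static size_t bounded_copy(char *dest, const char *src, size_t n) {
      size_t i = 0;
      if(n > 0) {
        while((i < n-1) && (src[i] != '\0')) {
          dest[i] = src[i];
          i++;   /* without this increment the loop never advances */
        }
        /* pad the remainder of dest with '\0' so it is always terminated */
        memset(&dest[i], '\0', n - i);
      }
      return i;  /* assumed: number of characters copied */
    }

    int main(void) {
      char buf[8];
      size_t copied = bounded_copy(buf, "NA12878", sizeof(buf));
      printf("copied %zu chars: '%s'\n", copied, buf);
      return 0;
    }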
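Note: --geno_prob now works with VCF files that provide either GL (log10-scaled genotype likelihoods) or GP (genotype posterior probabilities); vcf_parse_geno_probs prefers GP when both are present. get_format_index returns the position of a token within the colon-separated FORMAT string, so for a FORMAT of "GT:DS:GL" the GL index would be 2. The body of the GL-to-probability conversion is not shown in this hunk; the sketch below illustrates the standard transformation it is assumed to perform: raise 10 to each log10 likelihood and normalize the three values so they sum to 1, matching the normalization applied to GP values in vcf_parse_gp.

    /* convert one sample's GL triple (log10 likelihoods) into
     * normalized genotype probabilities */
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      /* hypothetical GL values for hom-ref, het, hom-alt */
      double gl[3] = { -0.48, -0.10, -3.2 };
      double prob[3], sum = 0.0;
      int i;

      for(i = 0; i < 3; i++) {
        prob[i] = pow(10.0, gl[i]);   /* back to the linear scale */
        sum += prob[i];
      }
      for(i = 0; i < 3; i++) {
        prob[i] /= sum;               /* force the triple to sum to 1.0 */
      }

      printf("P(hom ref)=%.3f  P(het)=%.3f  P(hom alt)=%.3f\n",
             prob[0], prob[1], prob[2]);
      return 0;
    }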