diff --git a/flu_reference_mapping_template.ipynb b/flu_reference_mapping_template.ipynb
index 7903bc8..a6dd261 100644
--- a/flu_reference_mapping_template.ipynb
+++ b/flu_reference_mapping_template.ipynb
@@ -33,34 +33,21 @@
     "# general\n",
     "import os\n",
     "import glob\n",
-    "import shutil\n",
     "import re\n",
-    "\n",
-    "#dates \n",
-    "from datetime import date\n",
-    "from datetime import time\n",
-    "from datetime import datetime\n",
-    "from datetime import timedelta\n",
     "\n",
     "# data\n",
     "import pandas as pd\n",
-    "import numpy as np\n",
     "\n",
     "# biopython\n",
-    "import Bio\n",
-    "from Bio import SeqIO\n",
-    "from Bio.SeqRecord import SeqRecord\n",
-    "\n",
     "from Bio import SeqIO\n",
-    "from Bio.SeqRecord import SeqRecord\n",
-    "from Bio.Seq import Seq\n",
-    "from Bio.Align.Applications import MafftCommandline\n",
-    "from Bio import AlignIO\n",
     "\n",
     "import subprocess\n",
     "\n",
     "DOCKER_BASE = 'docker run --rm -v ${PWD}:/data -u $(id -u):$(id -g)'\n",
-    "FASTQC_DOCKER = f'{DOCKER_BASE} staphb/fastqc:0.12~/flu/h5/test_h5_0005_nextseq_template_test/avrl_h5n1_250bp/seqyclean.1'\n",
+    "FASTQC_DOCKER = f'{DOCKER_BASE} staphb/fastqc:0.12'\n",
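+    "# Each *_DOCKER constant is a full command prefix; e.g. FASTQC_DOCKER expands to\n",
+    "# 'docker run --rm -v ${PWD}:/data -u $(id -u):$(id -g) staphb/fastqc:0.12',\n",
+    "# so !{FASTQC_DOCKER} <args> runs the tool against files under the mounted working directory\n",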
     "MULTIQC_DOCKER = f'{DOCKER_BASE} staphb/multiqc:1.8'\n",
     "SEQYCLEAN_DOCKER = f'{DOCKER_BASE} staphb/seqyclean:1.10.09'\n",
     "BWA_DOCKER = f'{DOCKER_BASE} staphb/bwa:0.7.17'\n",
@@ -84,25 +71,25 @@
    "outputs": [],
    "source": [
     "# working directory absolute path \n",
-    "wd = '/home/sam_baird/flu/h5/test_h5_0005_nextseq_template_test/avrl_h5n1_250bp/' # e.g. /home/sam_baird/flu/h5/test_h5_0005_nextseq/avrl_h5n1_250bp/\n",
+    "wd = '' # e.g. /home/sam_baird/flu/h5/test_h5_0005_nextseq/avrl_h5n1_250bp/\n",
     "\n",
-    "project_name = 'test_h5_0005_nextseq' # e.g. test_h5_0005_nextseq\n",
+    "project_name = '' # e.g. test_h5_0005_nextseq\n",
     "\n",
     "### INPUT FILES ###\n",
     "# just provide the filenames, no paths\n",
     "\n",
     "# if multiple primer sets in single project, use subworkbook split by primer_set\n",
-    "workbook = 'test_h5_0005_nextseq_workbook_avrl_h5n1_250bp.tsv' # e.g. test_h5_0005_nextseq_workbook_avrl_h5n1_250bp.tsv\n",
+    "workbook = '' # e.g. test_h5_0005_nextseq_workbook_avrl_h5n1_250bp.tsv\n",
     "\n",
-    "seqyclean_contaminants_fasta = 'Adapters_plus_PhiX_174.fasta'\n",
+    "seqyclean_contaminants_fasta = '' # e.g. Adapters_plus_PhiX_174.fasta\n",
     "\n",
-    "primer_bed = 'AVRL_H5N1_250bpAmpWGS_v1.bed'\n",
+    "primer_bed = '' # e.g. AVRL_H5N1_250bpAmpWGS_v1.bed\n",
     "\n",
     "# keys: short name (used for naming subdirectories and files)\n",
     "# values: filename\n",
     "# references can be either single gene segments or multifastas\n",
     "reference_sequences = {\n",
-    "    'h5n1_bovine': 'A_Bovine_texas_029328-01_UtoT.fasta', # e.g. 'h5n1_bovine': 'A_Bovine_texas_029328-01_UtoT.fasta'\n",
+    "    '': '', # e.g. 'h5n1_bovine': 'A_Bovine_texas_029328-01_UtoT.fasta'\n",
     "}"
    ]
   },
@@ -537,7 +524,58 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# TODO Determine consensus sequence using ivar consensus"
+    "# Calculate alignment metrics"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "aln_metrics_dir = 'alignment_metrics'\n",
+    "if not os.path.exists(aln_metrics_dir):\n",
+    "    os.mkdir(aln_metrics_dir)\n",
+    "\n",
+    "for sample in sample_names:\n",
+    "    for ref in reference_sequence_paths:\n",
+    "        aln_dir = os.path.join('bwa_alignment_trimmed', ref)\n",
+    "        bam_file = os.path.join(aln_dir, f'{sample}_trimmed.sorted.bam')\n",
+    "\n",
+    "        out_subdir_aln_metrics = os.path.join(aln_metrics_dir, ref)\n",
+    "        if not os.path.exists(out_subdir_aln_metrics):\n",
+    "            os.mkdir(out_subdir_aln_metrics)\n",
+    "\n",
+    "        out_bam_coverage = os.path.join(out_subdir_aln_metrics, f'{sample}_{ref}_coverage.txt')\n",
+    "        out_bam_stats = os.path.join(out_subdir_aln_metrics, f'{sample}_{ref}_stats.txt')\n",
+    "\n",
+    "        !{SAMTOOLS_DOCKER} samtools coverage -o {out_bam_coverage} {bam_file}\n",
+    "        !{SAMTOOLS_DOCKER} sh -c \"samtools stats {bam_file} > {out_bam_stats}\"\n",
+    "\n",
+    "        # Output per-segment stats for multi-segment references\n",
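+    "        # the ! capture returns the command's stdout as a list of lines (one\n",
+    "        # reference name per element); grep -v '*' drops the trailing row that\n",
+    "        # samtools idxstats emits for unmapped reads\n",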
+    "        segments = !{SAMTOOLS_DOCKER} samtools idxstats {bam_file} | cut -f1 | grep -v '*'\n",
+    "        if len(segments) > 1:\n",
+    "            for segment in segments:\n",
+    "                os.makedirs(os.path.join(out_subdir_aln_metrics, segment), exist_ok=True)\n",
+    "                out_bam_coverage = os.path.join(out_subdir_aln_metrics,\n",
+    "                                                segment,\n",
+    "                                                f'{sample}_{ref}_{segment}_coverage.txt')\n",
+    "                out_bam_stats = os.path.join(out_subdir_aln_metrics,\n",
+    "                                             segment,\n",
+    "                                             f'{sample}_{ref}_{segment}_stats.txt')\n",
+    "\n",
+    "                !{SAMTOOLS_DOCKER} samtools coverage --region {segment} -o {out_bam_coverage} {bam_file}\n",
+    "                !{SAMTOOLS_DOCKER} sh -c \"samtools stats {bam_file} {segment} > {out_bam_stats}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Determine consensus sequence using ivar consensus"
+   ]
+  },
   {
@@ -559,23 +597,32 @@
    "            trimmed_sorted_bam = os.path.join(trimmed_alignment_dir, ref, f'{sample}_trimmed.sorted.bam')\n",
    "            pileup_txt = os.path.join(trimmed_alignment_dir, ref, f'{sample}_pileup.txt')\n",
    "\n",
-    "            generate pileup\n",
+    "            # generate pileup\n",
    "            !{SAMTOOLS_DOCKER} samtools faidx {reference_sequence_paths[ref]}\n",
    "            !{SAMTOOLS_DOCKER} samtools mpileup -A -aa -d 600000 -B -Q 20 -q 20 -f {reference_sequence_paths[ref]} {trimmed_sorted_bam} -o {pileup_txt}\n",
    "\n",
-    "            # if reference is a multifasta, write separate consensus for each gene segment\n",
    "            records = SeqIO.parse(reference_sequence_paths[ref], format='fasta')\n",
    "            num_records = len(list(records))\n",
+    "            # if reference is a multifasta, write separate consensus for each gene segment\n",
    "            if num_records > 1:\n",
+    "                out_subdir_consensus = os.path.join(consensus_dir, ref)\n",
+    "                os.makedirs(out_subdir_consensus, exist_ok=True)\n",
+    "                segment_fastas = []\n",
    "                for segment in SeqIO.parse(reference_sequence_paths[ref], format='fasta'):\n",
-    "                    print('hi')\n",
-    "                    out_subdir_consensus = os.path.join(consensus_dir, ref, segment.id)\n",
-    "                    consensus_prefix = os.path.join(out_subdir_consensus, f'{sample}_{ref}_{segment.id}_consensus')\n",
-    "                    segment_id = segment.id\n",
-    "                    command = f\"cat {pileup_txt} | grep {segment_id} | ivar consensus -p {consensus_prefix} -q 20 -t 0.6 -m 10\"\n",
+    "                    out_subdir_consensus_segment = os.path.join(consensus_dir, ref, segment.id)\n",
+    "                    os.makedirs(out_subdir_consensus_segment, exist_ok=True)\n",
+    "                    consensus_prefix = os.path.join(out_subdir_consensus_segment, f'{sample}_{ref}_{segment.id}_consensus')\n",
+    "\n",
+    "                    command = f\"cat {pileup_txt} | grep {segment.id} | ivar consensus -p {consensus_prefix} -q 20 -t 0.6 -m 10\"\n",
    "                    print(command)\n",
    "                    !{IVAR_DOCKER} sh -c \"{command}\"\n",
    "                    \n",
+    "                    segment_fasta = f'{consensus_prefix}.fa'\n",
+    "                    segment_fastas.append(segment_fasta)\n",
+    "\n",
+    "                # also output a fasta with all the gene segments\n",
+    "                segments_fastas_str = ' '.join(segment_fastas)\n",
+    "                !cat {segments_fastas_str} > {out_subdir_consensus}/{sample}_{ref}_consensus.fa\n",
    "            else:\n",
    "                out_subdir_consensus = os.path.join(consensus_dir, ref)\n",
    "                consensus_prefix = os.path.join(out_subdir_consensus, f'{sample}_{ref}_consensus')\n",
@@ -586,7 +633,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# TODO Calculate alignment metrics"
+    "# Calculate percent coverage"
    ]
   },
   {
@@ -595,33 +642,234 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "aln_metrics_dir = 'alignment_metrics'\n",
-    "if not os.path.exists(aln_metrics_dir):\n",
-    "    os.mkdir(aln_metrics_dir)\n",
+    "# reference length with and without primers\n",
+    "reference_lengths = {}\n",
+    "for ref in reference_sequence_paths:\n",
+    "    records = list(SeqIO.parse(reference_sequence_paths[ref], format='fasta'))\n",
+    "    num_records = len(records)\n",
+    "    bed_df = pd.read_csv(primer_bed_path, sep='\\t', header=None)\n",
+    "    if num_records > 1:\n",
+    "        segment_lengths = {}\n",
+    "        for segment in records:\n",
+    "            total_length = len(segment.seq)\n",
+    "            segment_bed_df = bed_df[bed_df[0] == segment.id]\n",
+    "\n",
+    "# Primer0                PrimerN\n",
+    "# ======>                <=========\n",
+    "# sequence: A T C G G C G A T T A  A  A\n",
+    "# 0-based:  0 1 2 3 4 5 6 7 8 9 10 11 12\n",
+    "# 1-based:  1 2 3 4 5 6 7 8 9 10 11 12 13\n",
+    "\n",
+    "# Primer0 BED coordinates: [0,3)\n",
+    "# PrimerN BED coordinates: [9,13)\n",
+    "# Insert length: 9 - 3 = 6\n",
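+    "# In the BED dataframe, column 1 holds the 0-based starts and column 2 the ends,\n",
+    "# so for this toy segment max(start) = 9 (PrimerN) and min(end) = 3 (Primer0),\n",
+    "# i.e. segment_bed_df[1].max() - segment_bed_df[2].min() = 9 - 3 = 6, as below\n",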
+    "\n",
+    "            primer_insert_length = segment_bed_df[1].max() - segment_bed_df[2].min()\n",
+    "\n",
+    "            segment_lengths[segment.id] = (total_length, primer_insert_length)\n",
+    "        reference_lengths[ref] = segment_lengths\n",
+    "    else:\n",
+    "        total_length = len(records[0].seq)\n",
+    "        primer_insert_length = bed_df[1].max() - bed_df[2].min()\n",
+    "        reference_lengths[ref] = (total_length, primer_insert_length)\n",
+    "\n",
+    "reference_lengths"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
+    "def calculate_percent_coverage(sample, consensus, total_reference_length, primer_insert_length):\n",
+    "    record = SeqIO.read(consensus, 'fasta')\n",
+    "\n",
+    "    seq_len = len(record.seq)\n",
+    "    number_ns = record.seq.count('N')\n",
+    "    total_seq_len = seq_len - number_ns\n",
+    "\n",
+    "    # count N's in front of sequence and N's at back of sequence\n",
+    "    seq_str = str(record.seq)\n",
+    "\n",
+    "    if re.search('^N+', seq_str):\n",
+    "        string = re.findall('^N+', seq_str)[0]\n",
+    "        front_ns = len(string)\n",
+    "    else:\n",
+    "        front_ns = 0\n",
+    "\n",
+    "    if re.search('N+$', seq_str):\n",
+    "        string = re.findall('N+$', seq_str)[0]\n",
+    "        back_ns = len(string)\n",
+    "    else:\n",
+    "        back_ns = 0\n",
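+    "    # e.g. seq_str 'NNNATCGNN' gives number_ns = 5, front_ns = 3, back_ns = 2\n",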
+    "\n",
+    "    percent_coverage_total_length = round(total_seq_len/total_reference_length * 100, 1)\n",
+    "    percent_coverage_primer_insert_length = round(total_seq_len/primer_insert_length * 100, 1)\n",
+    "\n",
+    "    consensus_path_split = consensus.split('/')\n",
+    "    if len(consensus_path_split) == 4:\n",
+    "        reference = f'{consensus_path_split[1]}: {consensus_path_split[2]}'\n",
+    "        subdir = os.path.join(consensus_path_split[1], consensus_path_split[2])\n",
+    "    else:\n",
+    "        reference = consensus_path_split[1]\n",
+    "        subdir = consensus_path_split[1]\n",
+    "\n",
+    "    df = pd.DataFrame()\n",
+    "    df['sample_name'] = [sample]\n",
+    "    df['reference'] = reference\n",
+    "    df['percent_coverage_total_length'] = percent_coverage_total_length\n",
+    "    df['percent_coverage_primer_insert_length'] = percent_coverage_primer_insert_length\n",
+    "    df['number_aln_bases'] = seq_len\n",
+    "    df['ambiguous_bases'] = number_ns\n",
+    "    df['number_aln_nonambiguous_bases'] = total_seq_len\n",
+    "    df['front_ns'] = front_ns\n",
+    "    df['back_ns'] = back_ns\n",
     "    \n",
+    "    os.makedirs(os.path.join(aln_metrics_dir, subdir), exist_ok=True)\n",
+    "    fname = f'{consensus_path_split[-1].removesuffix(\"_consensus.fa\")}_percent_coverage.csv'\n",
+    "    outfile = os.path.join(aln_metrics_dir, subdir, fname)\n",
+    "    df.to_csv(outfile, index=False)\n",
+    "\n",
+    "    return df\n",
+    "\n",
+    "percent_coverage_dfs = []\n",
     "for sample in sample_names:\n",
-    "    # grab bam file\n",
-    "    aln_dir = os.path.join('bwa_alignment_trimmed', ref)\n",
-    "    bam_file = os.path.join(aln_dir, f'{sample}_trimmed.sorted.bam')\n",
+    "    for ref in reference_lengths:\n",
+    "        # if reference is multi-segment\n",
+    "        if isinstance(reference_lengths[ref], dict):\n",
+    "            for segment in reference_lengths[ref]:\n",
+    "                consensus = os.path.join(consensus_dir, ref, segment, f'{sample}_{ref}_{segment}_consensus.fa')\n",
+    "                percent_coverage_df = calculate_percent_coverage(\n",
+    "                    sample,\n",
+    "                    consensus,\n",
+    "                    reference_lengths[ref][segment][0],\n",
+    "                    reference_lengths[ref][segment][1],\n",
+    "                )\n",
+    "                percent_coverage_dfs.append(percent_coverage_df)\n",
+    "\n",
+    "        else:\n",
+    "            consensus = os.path.join(consensus_dir, ref, f'{sample}_{ref}_consensus.fa')\n",
+    "            percent_coverage_df = calculate_percent_coverage(\n",
+    "                sample,\n",
+    "                consensus,\n",
+    "                reference_lengths[ref][0],\n",
+    "                reference_lengths[ref][1],\n",
+    "            )\n",
+    "            percent_coverage_dfs.append(percent_coverage_df)\n",
+    "\n",
+    "concat_percent_coverage_df = pd.concat(percent_coverage_dfs).reset_index(drop=True)\n",
+    "concat_percent_coverage_outfile = os.path.join(aln_metrics_dir, f'{project_name}_percent_coverage.csv')\n",
+    "concat_percent_coverage_df.to_csv(concat_percent_coverage_outfile, index=False)\n",
+    "concat_percent_coverage_df"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Summarize coverage data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "def summarize_coverage(sample, stats_dir):\n",
+    "    stats_dir_split = stats_dir.split('/')\n",
+    "    if len(stats_dir_split) == 3:\n",
+    "        # ref_segment\n",
+    "        ref_str = f'{stats_dir_split[1]}_{stats_dir_split[2]}'\n",
+    "    else:\n",
+    "        # just ref\n",
+    "        ref_str = f'{stats_dir_split[1]}'\n",
+    "\n",
+    "    bam_coverage = os.path.join(stats_dir, f'{sample}_{ref_str}_coverage.txt')\n",
+    "    bam_cov_df = pd.read_csv(bam_coverage, sep='\\t')\n",
+    "\n",
+    "    # pull out number of reads mapped and use with fastqc to calculate\n",
+    "    # the number of unmapped reads and the percent of reads mapped\n",
+    "    reads_mapped = 0\n",
+    "    bam_stats = os.path.join(stats_dir, f'{sample}_{ref_str}_stats.txt')\n",
+    "    with open(bam_stats, 'r') as file:\n",
+    "        for line in file:\n",
+    "            if re.search('reads mapped:', line):\n",
+    "                reads_mapped = float(line.split('\\t')[2].strip())\n",
+    "\n",
+    "    # pull out number of reads from fastqc output\n",
+    "    fastqc_metrics_file = os.path.join(wd, \"fastqc_clean\", sample, f'{sample}_summary_metrics.tsv')\n",
+    "    fastqc_df = pd.read_csv(fastqc_metrics_file, sep='\\t')\n",
+    "    total_raw_reads = fastqc_df['r1_total_reads'][0] + fastqc_df['r2_total_reads'][0]\n",
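+    "    # e.g. r1_total_reads = 100000 and r2_total_reads = 100000 give total_raw_reads = 200000;\n",
+    "    # with reads_mapped = 150000 this yields percent_reads_mapped = 75.0 below\n",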
non-overlapping coverage" ] }, { @@ -630,31 +864,115 @@ "metadata": {}, "outputs": [], "source": [ - "# reference length with and without primers\n", - "# e.g.\n", - "# reference_lengths = {\n", - "reference_lengths = {}\n", - "for ref in reference_sequence_paths:\n", - " records = SeqIO.parse(reference_sequence_paths[ref])\n", - " num_records = len(list(records))\n", - " bed_df = pd.read_csv(bed_file_path, sep='\\t', header=None)\n", - " if num_records > 1:\n", - " segment_lengths = {}\n", - " for segment in records:\n", - " total_length = len(segment.seq)\n", - " segment_bed_df = bed_df[bed_df[0] == segment.id]\n", - "\n", - " # Span of primers is assumed to be the (highest end coordinate) - (lowest start coordinate) \n", - " primer_covered_length = segment_bed_df[2].max() - segment_bed_df[1].min()\n", + "# note: expects exactly two primers per amplicon\n", + "amplicon_dir = 'amplicon_metrics'\n", + "if not os.path.exists(amplicon_dir):\n", + " os.mkdir(amplicon_dir)\n", "\n", - " segment_lengths[segment.id] = (total_length, primer_covered_length)\n", - " reference_lengths[ref] = segment_lengths\n", + "def extract_amplicon_name(primer_name):\n", + " last_separator = max(primer_name.rfind('-'), primer_name.rfind('_'))\n", + " if last_separator != -1:\n", + " amplicon = primer_name[:last_separator]\n", " else:\n", - " total_length = len(records[0].seq)\n", - " primer_covered_length = bed_df[2].max() - bed_df[1].min()\n", - " reference_lengths[ref] = (total_length, primer_covered_length)\n", + " raise ValueError(f'Could not parse amplicon name of primer {primer_name}')\n", + " return amplicon\n", "\n", - "reference_lengths" + "bed_df['amplicon'] = bed_df[3].apply(extract_amplicon_name)\n", + "\n", + "# amplicon number assumed to be last string of digits in amplicon_name\n", + "bed_df['amplicon_number'] = bed_df[3].str.extract(r'(\\d+)(?!.*\\d)', expand=False)\n", + "bed_df['amplicon_number'] = bed_df['amplicon_number'].astype(int)\n", + "\n", + "primers = list(bed_df[3])\n", + "amplicons = list(bed_df['amplicon'].unique())\n", + "if len(primers) / len(amplicons) != 2:\n", + " raise ValueError('Either more than two primers per amplicon or cannot properly parse primer names in BED')\n", + "\n", + " \n", + "# Per-amplicon non-overlapping coverage (after primer trimming)\n", + "\n", + "# Primer 0L 1L 0R 2L 1R 3L 2R 3R\n", + "# ===> ===> <=== ===> <=== ===> <=== <=== \n", + "# Amplicon ---Amp0--- ------Amp1------ ------Amp2------ ---Amp3---\n", + "# sequence: T T G G G T G A C T G A A C C A T T G G A A G A A A C\n", + "# 0-based: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26\n", + "# 1-based: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27\n", + "\n", + "# Formulas for Amp0, Amp1, Amp2, ... 
+    "\n",
+    "primers = list(bed_df[3])\n",
+    "amplicons = list(bed_df['amplicon'].unique())\n",
+    "if len(primers) / len(amplicons) != 2:\n",
+    "    raise ValueError('Expected exactly two primers per amplicon; check that primer names in the BED parse correctly')\n",
+    "\n",
+    "# Per-amplicon non-overlapping coverage (after primer trimming)\n",
+    "\n",
+    "# Primer     0L    1L    0R    2L    1R    3L    2R    3R\n",
+    "#            ===>  ===>  <===  ===>  <===  ===>  <===  <===\n",
+    "# Amplicon   ---Amp0---  ------Amp1------  ------Amp2------  ---Amp3---\n",
+    "# sequence:  T T G G G T G A C T G  A  A  C  C  A  T  T  G  G  A  A  G  A  A  A  C\n",
+    "# 0-based:   0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26\n",
+    "# 1-based:   1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27\n",
+    "\n",
+    "# Formulas for Amp0, Amp1, Amp2, ... AmpN (see branch comments below):\n",
+    "\n",
+    "amplicon_df = pd.DataFrame({'amplicon': amplicons})\n",
+    "for sample in sample_names:\n",
+    "    for ref in reference_sequence_paths:\n",
+    "        amplicon_subdir = os.path.join(amplicon_dir, ref)\n",
+    "        if not os.path.exists(amplicon_subdir):\n",
+    "            os.mkdir(amplicon_subdir)\n",
+    "        out_file = os.path.join(amplicon_subdir, f'{sample}_amplicon_coverage.txt')\n",
+    "\n",
+    "        trimmed_sorted_bam = os.path.join(trimmed_alignment_dir, ref, f'{sample}_trimmed.sorted.bam')\n",
+    "\n",
+    "        # truncate output from any previous run on the first region written\n",
+    "        first_region = True\n",
+    "        for amplicon in amplicons:\n",
+    "            region_name = bed_df[bed_df['amplicon'] == amplicon][0].iloc[0]\n",
+    "            amplicon_number = bed_df[bed_df['amplicon'] == amplicon]['amplicon_number'].iloc[0]\n",
+    "\n",
+    "            # assume first amplicon is first in bed\n",
+    "            if amplicon == bed_df[bed_df[0] == region_name].iloc[0]['amplicon']:\n",
+    "                # (Amp0 1-based coords) = [((Primer0L BED end) + 1), (Primer1L BED end)]\n",
+    "                region_start = bed_df[\n",
+    "                    (bed_df[0] == region_name)\n",
+    "                    & (bed_df['amplicon_number'] == amplicon_number)\n",
+    "                    & (bed_df[5] == '+')\n",
+    "                ][2].values[0] + 1\n",
+    "\n",
+    "                region_end = bed_df[\n",
+    "                    (bed_df[0] == region_name)\n",
+    "                    & (bed_df['amplicon_number'] == amplicon_number + 1)\n",
+    "                    & (bed_df[5] == '+')\n",
+    "                ][2].values[0]\n",
+    "\n",
+    "            # assume last amplicon is last in bed\n",
+    "            elif amplicon == bed_df[(bed_df[0] == region_name)].iloc[-1]['amplicon']:\n",
+    "                # (AmpN 1-based coords) = [((Primer(N-1)R BED start) + 1), (PrimerNR BED start)]\n",
+    "                region_start = bed_df[\n",
+    "                    (bed_df[0] == region_name)\n",
+    "                    & (bed_df['amplicon_number'] == amplicon_number - 1)\n",
+    "                    & (bed_df[5] == '-')\n",
+    "                ][1].values[0] + 1\n",
+    "\n",
+    "                region_end = bed_df[\n",
+    "                    (bed_df[0] == region_name)\n",
+    "                    & (bed_df['amplicon_number'] == amplicon_number)\n",
+    "                    & (bed_df[5] == '-')\n",
+    "                ][1].values[0]\n",
+    "\n",
+    "            else:\n",
+    "                # (AmpX 1-based coords) = [((Primer(X-1)R BED start) + 1), (Primer(X+1)L BED end)]\n",
+    "                region_start = bed_df[\n",
+    "                    (bed_df[0] == region_name)\n",
+    "                    & (bed_df['amplicon_number'] == amplicon_number - 1)\n",
+    "                    & (bed_df[5] == '-')\n",
+    "                ][1].values[0] + 1\n",
+    "\n",
+    "                region_end = bed_df[\n",
+    "                    (bed_df[0] == region_name)\n",
+    "                    & (bed_df['amplicon_number'] == amplicon_number + 1)\n",
+    "                    & (bed_df[5] == '+')\n",
+    "                ][2].values[0]\n",
+    "\n",
+    "            region = f'{region_name}:{region_start}-{region_end}'\n",
+    "\n",
+    "            if region_start > region_end:\n",
+    "                print(f'Skipping {sample} {amplicon}, unusual primer order')\n",
+    "                continue\n",
+    "\n",
+    "            if first_region:\n",
+    "                !{SAMTOOLS_DOCKER} sh -c \"samtools coverage --region {region} {trimmed_sorted_bam} > {out_file}\"\n",
+    "                first_region = False\n",
+    "            else:\n",
+    "                !{SAMTOOLS_DOCKER} sh -c \"samtools coverage --region {region} {trimmed_sorted_bam} | tail -n 1 >> {out_file}\"\n",
+    "\n",
+    "pd.set_option('display.max_rows', 500)\n",
+    "\n",
+    "bed_df"
   ]
  }
 ],