Skip to content

Commit

Permalink
wip force-include human cases
Browse files Browse the repository at this point in the history
  • Loading branch information
jameshadfield committed Dec 11, 2024
1 parent 924b7cb commit 87b127a
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 4 deletions.
6 changes: 4 additions & 2 deletions rules/cattle-flu.smk
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ rule join_segments:
# allow snakemake to choose the correct rule to run. Note that `wildcards.segment="genome"`
# here, and for that we need alignments for 8 individual segments, which we refer to as `wildcards.genome_seg`
input:
alignment = expand("results/{{subtype}}/{{segment}}/{{time}}/aligned_{genome_seg}.fasta", genome_seg=SEGMENTS)
alignment = expand("results/{{subtype}}/{{segment}}/{{time}}/aligned_{genome_seg}.fasta", genome_seg=SEGMENTS),
metadata = metadata_by_wildcards,
output:
alignment = "results/{subtype}/{segment}/{time}/aligned.fasta",
node_data = "results/{subtype}/{segment}/{time}/aligned.json",
Expand All @@ -77,7 +78,8 @@ rule join_segments:
python scripts/join-segments.py \
--segments {input.alignment} \
--output {output.alignment} \
--output-node-data {output.node_data}
--output-node-data {output.node_data} \
--force-include $( cat {input.metadata} | csvtk filter2 -t -f '$host=="Human"' | csvtk cut -t -f strain | tail -n +2 | tr "\n" " " )
"""

rule genome_metadata:
Expand Down
9 changes: 7 additions & 2 deletions scripts/join-segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--segments', type = str, required = True, nargs='+', help = "per-segment alignments")
parser.add_argument('--force-include', type=str, nargs="+", required=False,
help="Force include these strains regardless of how many segments have been sequenced")
parser.add_argument('--output', type = str, required = True, help = "output whole genome alignment")
parser.add_argument('--output-node-data', type = str, required = False, help = "output metadata in node-data JSON format")
args = parser.parse_args()
Expand Down Expand Up @@ -51,8 +53,11 @@ def atgc_perc(seq):
print("writing genome to ", args.output)
for name,count in strain_counts.items():
if count<7:
print(f"Excluding {name} as it only appears in {count} segments")
continue
if name in args.force_include:
print(f"Force including {name} which would otherwise be dropped as it only appears in {count} segments")
else:
print(f"Excluding {name} as it only appears in {count} segments")
continue
genome = "".join([sequence(seg, name) for seg in args.segments])
node_data['nodes'][name] = {
"ATGC_perc": int( len([nt for nt in genome if nt in atgc])/len(genome) * 100),
Expand Down

0 comments on commit 87b127a

Please sign in to comment.