Merge branch 'main' into 24-add-server-feature

bihealth · Jan 5, 2024 · 11e33c4 · 11e33c4
2 parents c9d6173 + c5497aa
commit 11e33c4
Show file tree

Hide file tree

Showing 8 changed files with 61 additions and 37 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog
 
+### [0.8.1](https://www.github.com/bihealth/ngs-chew/compare/v0.8.0...v0.8.1) (2023-07-18)
+
+
+### Bug Fixes
+
+* sites for GRCh38 ([#24](https://www.github.com/bihealth/ngs-chew/issues/24)) ([#39](https://www.github.com/bihealth/ngs-chew/issues/39)) ([79773ef](https://www.github.com/bihealth/ngs-chew/commit/79773ef0e829184e763aee61373ae46f6ea83ed5))
+
 ## [0.8.0](https://www.github.com/bihealth/ngs-chew/compare/v0.7.1...v0.8.0) (2023-01-16)
 
 

diff --git a/chew/cli.py b/chew/cli.py
@@ -23,7 +23,7 @@ def cli(ctx: click.Context, verbose: bool):
     ctx.obj["verbose"] = verbose
 
 
-@cli.command("fingerprint", help="Compute fingerprint to numpy .npz files.")
+@cli.command("fingerprint", help="Compute fingerprint to numpy .npz files.")  # type: ignore[attr-defined]
 @click.option("--min-coverage", type=int, default=5, help="Minimal required coverage.")
 @click.option("--reference", required=True, help="Path to reference FASTA file.")
 @click.option(
@@ -95,7 +95,7 @@ def cli_fingerprint(
     fingerprint.run(config)
 
 
-@cli.command("compare", help="Perform fingeprint comparison.")
+@cli.command("compare", help="Perform fingeprint comparison.")  # type: ignore[attr-defined]
 @click.option(
     "--output-prefix", type=str, default="chew-comparison", help="Path to comparison file."
 )
@@ -120,7 +120,7 @@ def cli_compare(
     compare.run(config)
 
 
-@cli.command("stats", help="Compute statistics from fingerprint .npz files.")
+@cli.command("stats", help="Compute statistics from fingerprint .npz files.")  # type: ignore[attr-defined]
 @click.option("--output", default="chew-stats.txt", help="Path to output file.")
 @click.argument("fingerprints", nargs=-1)
 @click.pass_context
@@ -137,7 +137,7 @@ def cli_stats(
     stats.run(config)
 
 
-@cli.command("plot-compare", help="Plot result of 'ngs-chew compare'.")
+@cli.command("plot-compare", help="Plot result of 'ngs-chew compare'.")  # type: ignore[attr-defined]
 @click.option(
     "--title", default="NGS Chew Comparison Plot", help="title to use for the output HTML file."
 )
@@ -159,7 +159,7 @@ def cli_plot_compare(
     plot_compare.run(config)
 
 
-@cli.command("plot-var-het", help="Plot var(het) metric from .npz files.")
+@cli.command("plot-var-het", help="Plot var(het) metric from .npz files.")  # type: ignore[attr-defined]
 @click.option(
     "--title", default="NGS Chew var(het) Plot", help="title to use for the output HTML file."
 )

diff --git a/chew/common.py b/chew/common.py
@@ -1,5 +1,12 @@
 """Commonly used code"""
 
+import gzip
+import os
+import typing
+
+import attrs
+from logzero import logger
+
 #: Chromosome lengths in GRCh37.
 import enum
 import typing
@@ -112,3 +119,33 @@ def pedigree_member_from_tsv(arr: typing.List[str]) -> PedigreeMember:
         sex=PED_SEX_MAP.get(arr[4], Sex.UNKNOWN),
         disease_state=PED_DISEASE_MAP.get(arr[5], DiseaseState.UNKNOWN),
     )
+
+
+@attrs.frozen
+class Site:
+    chrom: str
+    pos: int
+    ref: str
+    alt: str
+
+
+def load_sites(genome_release: str) -> typing.List[Site]:
+    logger.info("Loading sites .bed.gz for %s", genome_release)
+    path_gz = os.path.join(os.path.dirname(__file__), "data", f"{genome_release}_sites.bed.gz")
+    result = []
+    with gzip.open(path_gz, "rt") as inputf:
+        for line in inputf:
+            arr = line.split("\t")
+            # We construct sites with arbitrary REF/ALT as bcftools roh does not
+            # interpret them but only GT.
+            result.append(
+                Site(
+                    chrom=arr[0],
+                    pos=int(arr[1]) + 1,
+                    ref="N",
+                    alt="A",
+                )
+            )
+    logger.info("  finished reading %d sites", len(result))
+    logger.info("  first = %s, last = %s", result[0], result[-1])
+    return result
diff --git a/chew/data/GRCh38_sites.bed.gz b/chew/data/GRCh38_sites.bed.gz
diff --git a/chew/data/GRCh38_sitesX.bed.gz b/chew/data/GRCh38_sitesX.bed.gz
diff --git a/chew/fingerprint.py b/chew/fingerprint.py
@@ -16,7 +16,12 @@
 
 import chew
 from chew import roh
-from chew.common import CHROM_LENS_GRCH37, CHROM_LENS_GRCH38, CHROM_LENS_HG19
+from chew.common import (
+    CHROM_LENS_GRCH37,
+    CHROM_LENS_GRCH38,
+    CHROM_LENS_HG19,
+    load_sites,
+)
 
 #: Key to use for GRCh37 release.
 RELEASE_37 = "GRCh37"
@@ -155,12 +160,10 @@ def snps_step_call(
 ):
     bed_file = f"{genome_release}_{sites_suffix}.bed.gz"
     logger.info("Reading sites BED (%s)...", bed_file)
-    path_gz = os.path.join(os.path.dirname(__file__), "data", bed_file)
-    with gzip.open(path_gz, "rt") as inputf:
-        sites = {}
-        for line in inputf:
-            arr = line.strip().split("\t")
-            sites["%s%s:%s" % (chr_prefix, arr[0], int(arr[1]) + 1)] = (0, 0, float("nan"))
+    sites = {
+        "%s%s:%s" % (chr_prefix, site.chrom, site.pos): (0, 0, float("nan"))
+        for site in load_sites(genome_release)
+    }
     logger.info("Converting VCF to fingerprint...")
     with vcfpy.Reader.from_path(path_calls) as vcf_reader:
         if prefix_fingerprint:

diff --git a/chew/roh.py b/chew/roh.py
@@ -1,15 +1,13 @@
-import gzip
 import os
 import shlex
 import subprocess
-import typing
 
 import attrs
 from logzero import logger
 import numpy.typing as npt
 import vcfpy
 
-from chew.common import CHROM_LENS_GRCH37, CHROM_LENS_GRCH38
+from chew.common import CHROM_LENS_GRCH37, CHROM_LENS_GRCH38, load_sites
 
 #: Use this value for genotype likelyhood (FORMAT/PL)
 BCFTOOLS_ROH_PL = 30
@@ -43,27 +41,6 @@ class Site:
     alt: str
 
 
-def load_sites(genome_release: str) -> typing.List[Site]:
-    logger.info("Loading sites .bed.gz")
-    path_gz = os.path.join(os.path.dirname(__file__), "data", f"{genome_release}_sites.bed.gz")
-    result = []
-    with gzip.open(path_gz, "rt") as inputf:
-        for lineno, line in enumerate(inputf):
-            arr = line.split("\t")
-            # We construct sites with arbitrary REF/ALT as bcftools roh does not
-            # interpret them but only GT.
-            result.append(
-                Site(
-                    chrom=arr[0],
-                    pos=int(arr[1]) + 1,
-                    ref="N",
-                    alt="A",
-                )
-            )
-    logger.info("  finished reading %d sites", len(result))
-    return result
-
-
 def create_vcf_header(sample: str, release: str) -> vcfpy.Header:
     if release == "GRCh37":
         chrom_lens = CHROM_LENS_GRCH37

diff --git a/requirements/test.txt b/requirements/test.txt
@@ -8,4 +8,4 @@ pytest-cov
 pytest-click
 pyfakefs >=3.7.1
 
-mypy ==0.990
+mypy ==1.4.1