Skip to content

Commit

Permalink
feat: implementing web service for resolving to SPDI (#4)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Oct 4, 2023
1 parent 717afe9 commit 8e76ced
Show file tree
Hide file tree
Showing 23 changed files with 450 additions and 46 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
tests/data/*.json* filter=lfs diff=lfs merge=lfs -text
tests/data/seqrepo/** filter=lfs diff=lfs merge=lfs -text
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
__pycache__

/data

# Coverage information.
.coverage
coverage.lcov
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,5 @@ ci: \

.PHONY: serve
serve:
DATA_DIR=$(PWD)/tests/data \
pipenv run uvicorn dotty.main:app --host 0.0.0.0 --port 8080 --reload --workers 8
3 changes: 3 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ name = "pypi"
cdot = "*"
fastapi = "*"
hgvs = "*"
pydantic-settings = "*"
uvicorn = "*"

[dev-packages]
black = "*"
Expand All @@ -15,6 +17,7 @@ isort = "*"
mypy = "*"
pytest = "*"
pytest-coverage = "*"
httpx = "*"

[requires]
python_version = "3.10"
105 changes: 102 additions & 3 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,28 @@

# dotty - cdot-based position projection

## Obtaining Data

`datasets` is the NCBI `datasets` tool.

```
$ mkdir -p data
$ cd data
$ wget \
https://github.com/SACGF/cdot/releases/download/v0.2.21/cdot-0.2.21.ensembl.grch37.json.gz \
https://github.com/SACGF/cdot/releases/download/v0.2.21/cdot-0.2.21.ensembl.grch38.json.gz \
https://github.com/SACGF/cdot/releases/download/v0.2.21/cdot-0.2.21.refseq.grch37.json.gz \
https://github.com/SACGF/cdot/releases/download/v0.2.21/cdot-0.2.21.refseq.grch38.json.gz
$ download genome accession GCF_000001405.25 --filename GRCh37.zip
$ download genome accession GCF_000001405.40 --filename GRCh38.zip
$ unzip GRCh37.zip
$ unzip GRCh38.zip
$ seqrepo --root-directory $PWD load --namespace ncbi --instance-name seqrepo ncbi_dataset/data/GCF_000001405.*/*.fna
$ rm -rf GRCh3?.zip ncbi_dataset
```

## Terraform Project Management

```
Expand Down
29 changes: 29 additions & 0 deletions dotty/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import logging
import os
import secrets
from typing import Any

from pydantic import AnyHttpUrl, BaseModel, EmailStr, HttpUrl, PostgresDsn, field_validator
from pydantic_core.core_schema import ValidationInfo
from pydantic_settings import BaseSettings, SettingsConfigDict

logger = logging.getLogger(__name__)


class Settings(BaseSettings):
"""Configuration of dotty web server."""

#: Enable loading variables from ``.env`` files.
model_config = SettingsConfigDict(
env_file=".env", env_file_encoding="utf-8", case_sensitive=True
)

#: Path to the directory with the cdot ``.json.gz`` files.
DATA_DIR: str = "/data"

#: Whether seqrepo is available for the reference, allows normalization
#: of reference-level variants.
HAVE_SEQREPO: bool = True


settings = Settings(_env_file=".env", _env_file_encoding="utf-8") # type: ignore[call-arg]
75 changes: 44 additions & 31 deletions dotty/core.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,34 @@
import enum
import logging
import os
import pathlib
import time
from datetime import timedelta
from unittest import mock

import bioutils.assemblies
import hgvs.parser
from cdot.hgvs.dataproviders import JSONDataProvider
from hgvs.assemblymapper import AssemblyMapper
from hgvs.dataproviders.seqfetcher import SeqFetcher
from hgvs.exceptions import HGVSDataNotAvailableError
from hgvs.dataproviders.interface import Interface
from hgvs.extras import babelfish

from dotty.config import settings

#: Logger used in this module.
_logger = logging.getLogger(__name__)


class Babelfish(babelfish.Babelfish):
"""Custom Babelfish that also knows about GRCh37."""

def __init__(self, hdp: Interface, assembly_name: str):
super().__init__(hdp, assembly_name)
for assembly_name in ("GRCh37", "GRCh38"):
for sr in bioutils.assemblies.get_assembly(assembly_name)["sequences"]:
self.ac_to_chr_name_map[sr["refseq_ac"]] = sr["name"]


class Assembly(enum.Enum):
"""Enumeration for supported assemblies."""

Expand All @@ -23,17 +38,6 @@ class Assembly(enum.Enum):
GRCH38 = "GRCh38"


class NullSeqFetcher(SeqFetcher):
"""A null sequence fetcher that always returns None."""

def __init__(self):
_logger.info("(Not) fetching sequences with NullSeqFetcher")

def fetch_seq(self, ac, start_i=None, end_i=None): # pragma: no cover
_ = ac, start_i, end_i
raise HGVSDataNotAvailableError("dotty cannot fetch sequences")


class Driver:
"""Provides references to the data files."""

Expand All @@ -56,30 +60,39 @@ def __init__(self, cdot_dir: str):
self.data_providers: dict[Assembly, JSONDataProvider] = {}
#: The assembly mapper to use for each genome.
self.assembly_mappers: dict[Assembly, AssemblyMapper] = {}
#: One Babelfish for each assembly.
self.babelfishes: dict[Assembly, Babelfish] = {}
#: The HGVS parser.
self.parser = hgvs.parser.Parser()

def load(self):
"""Loads the data from the files."""
_logger.info("Loading data from %s: %s ...", self.cdot_dir, self.assembly_file_names)
start_time = time.time()
self.data_providers = {
assembly: JSONDataProvider(
[str(self.cdot_dir / fname) for fname in assembly_file_names],
seqfetcher=NullSeqFetcher(),
)
for assembly, assembly_file_names in self.assembly_file_names.items()
}
self.assembly_mappers = {
assembly: AssemblyMapper(
self.data_providers[assembly],
assembly_name=assembly.value,
alt_aln_method="splign",
normalize=False,
replace_reference=False,
prevalidation_level=None,
)
for assembly in self.assembly_file_names
}

# We temporarily override the HGVS_SEQREPO_DIR environment variable for construction
# of hgvs / cdot objects.
with mock.patch.dict(os.environ, {"HGVS_SEQREPO_DIR": str(self.cdot_dir / "seqrepo")}):
self.data_providers = {
assembly: JSONDataProvider(
[str(self.cdot_dir / fname) for fname in assembly_file_names],
)
for assembly, assembly_file_names in self.assembly_file_names.items()
}
self.assembly_mappers = {
assembly: AssemblyMapper(
self.data_providers[assembly],
assembly_name=assembly.value,
alt_aln_method="splign",
normalize=settings.HAVE_SEQREPO,
replace_reference=settings.HAVE_SEQREPO,
prevalidation_level=None,
)
for assembly in Assembly
}
self.babelfishes = {
assembly: Babelfish(hdp=self.data_providers[assembly], assembly_name=assembly.value)
for assembly in Assembly
}
elapsed = timedelta(seconds=time.time() - start_time)
_logger.info("... loaded in %s", elapsed)
Loading

0 comments on commit 8e76ced

Please sign in to comment.