diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 0000000..86430d7 --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,8 @@ +[bumpversion] +current_version = 0.1.0 +commit = True +tag = True + +[bumpversion:file:README.md] + +[bumpversion:file:dna_mutator/version.py] diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..7a21eb1 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,25 @@ +name: build + +on: [push, workflow_dispatch] + +jobs: + build: + + runs-on: ubuntu-22.04 + + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: '3.9' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest pytest-cov + - name: Test pip installation + run: | + pip install -e . + - name: Test with pytest + run: | + python -m pytest --cov dna_mutator diff --git a/.gitignore b/.gitignore index 82f9275..b6e4761 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ parts/ sdist/ var/ wheels/ +pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg @@ -49,7 +50,6 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ -cover/ # Translations *.mo @@ -72,7 +72,6 @@ instance/ docs/_build/ # PyBuilder -.pybuilder/ target/ # Jupyter Notebook @@ -83,9 +82,7 @@ profile_default/ ipython_config.py # pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version +.python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -94,24 +91,7 @@ ipython_config.py # install all needed dependencies. #Pipfile.lock -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
-# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +# PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff @@ -147,16 +127,3 @@ dmypy.json # Pyre type checker .pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..beb22c6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Edinburgh Genome Foundry + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 058d576..9b561c2 100644 --- a/README.md +++ b/README.md @@ -1 +1,41 @@ -# Mutator \ No newline at end of file +
+ +
+ +# DNA Mutator + +![version](https://img.shields.io/badge/current_version-0.1.0-blue) +[![build](https://github.com/Edinburgh-Genome-Foundry/dna_mutator/actions/workflows/build.yml/badge.svg)](https://github.com/Edinburgh-Genome-Foundry/dna_mutator/actions/workflows/build.yml) + +Create variants of DNA sequences. + +This repository is based on the software code of a dissertation project (B237870) for the MSc Bioinformatics program at the University of Edinburgh. + +## Install + +```bash +pip install git+https://github.com/Edinburgh-Genome-Foundry/mutator.git +``` + +## Usage + +```python +import dna_mutator as mutator +record = mutator.Mutator.read_genbank("EGF.gb") +mut = mutator.Mutator(record) +mut.DelN() +mut.write_all_records("variants") +``` + +## Versioning + +DNA Mutator uses the [semantic versioning](https://semver.org) scheme. + +## License = MIT + +DNA Mutator is free/libre and open-source software, which means the users have the freedom to run, study, change and distribute the software. + +DNA Mutator was written at the [Edinburgh Genome Foundry](https://edinburgh-genome-foundry.github.io/) +by [B237870](https://github.com/B237870-2024) and [Peter Vegh](https://github.com/veghp). + +Copyright 2024 Edinburgh Genome Foundry, University of Edinburgh diff --git a/dna_mutator/Mutator.py b/dna_mutator/Mutator.py new file mode 100644 index 0000000..479656f --- /dev/null +++ b/dna_mutator/Mutator.py @@ -0,0 +1,148 @@ +import os +import random + +import pandas + +from Bio import SeqIO +from Bio.Seq import Seq, MutableSeq +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import SeqFeature, FeatureLocation + + +class Mutator: + """Class to generate simulations of structural and single nucleotide variants. + + + **Parameters** + + **reference** + > A `SeqRecord` instance. + + **library_size** + > Library size (`int`). 
+ """ + + def __init__(self, reference, library_size=10): + self.reference = reference + self.library_size = library_size + self.variant_records = [] + + def write_sample_sheet(self, csv_file): + """Create a sample sheet (for use with Sequeduct)""" + barcode_dir = [ + "barcode" + ("{0:02d}".format(i + 1)) + for i in range(len(self.variant_records)) + ] + + variants = [variant_record.id for variant_record in self.variant_records] + df_variants = pandas.DataFrame({"Sample": variants, "Barcode_dir": barcode_dir}) + df_variants.to_csv(csv_file, index=False) + + @staticmethod + def subtract_bases(seq, pos, n): + """Subtract N bases from a sequence + + + **Parameters** + + **seq** + > `Seq` instance. + + **pos** + > Location of change (`int`). + + **n** + > Number of bases to subtract (`int`). + """ + modified_sequence = MutableSeq(seq) + deleted_sequence = modified_sequence[pos : pos + n] + del modified_sequence[pos : pos + n] + + return ( + modified_sequence, + pos, + deleted_sequence, + ) + + @staticmethod + def get_random_pos(record, n=1): + """Get n different random positions in a record""" + positions = random.sample(range(0, len(record)), n) + + return positions + + @staticmethod + def read_genbank(genbank, use_file_name_as_id=True): + """Get the reference sequence and features from input file + + **Parameters** + + **genbank** + > Path to Genbank file (`str`). + + **use_file_name_as_id** + > Replace record id and name with the filename (`bool`). 
+ """ + record = SeqIO.read(genbank, "genbank") + if use_file_name_as_id: + record.name = os.path.splitext(os.path.basename(genbank))[0] + record.id = record.name + + return record + + @staticmethod + def write_genbank(record, file_name): + """Write SeqRecord to a Genbank file""" + SeqIO.write(record, file_name, "gb") + + def write_all_records(self, dir_name): + """Write original record and all variants into a directory""" + extension = ".gb" # standard file ext for GenBank files + os.mkdir(dir_name) + # ORIGINAL REFERENCE RECORD + ref_path = os.path.join(dir_name, self.reference.id + extension) + self.write_genbank(self.reference, ref_path) + # VARIANTS + for variant in self.variant_records: + variant_path = os.path.join(dir_name, variant.id) + self.write_genbank(variant, variant_path + extension) + + def DelN(self, bases=1): + """Simulate N base deletion""" + positions = self.get_random_pos(self.reference, n=self.library_size) + for i in range(self.library_size): + position = positions[i] + modified_sequence, position, deleted_sequence = self.subtract_bases( + self.reference.seq, position, bases + ) + if len(deleted_sequence) == 1: # show the letter if there's only one + suffix = str(deleted_sequence) + else: + suffix = str(len(deleted_sequence)) + # We append the original name according to nomenclature: + variant_name = self.reference.id + "_" + str(position) + "D" + suffix + variant_record = SeqRecord( + Seq(modified_sequence), + id=variant_name, + name=variant_name, + annotations={"molecule_type": "DNA", "topology": "circular"}, + ) + label = "@mutator(del)" + description = ( + "Deletion in position " + + str(position) + + " of " + + str(bases) + + " bases (" + + str(deleted_sequence) + + ")" + ) + feature = SeqFeature( + FeatureLocation(position, position), + type="misc_feature", + id="@mutator", + qualifiers={"label": label, "note": description}, + ) + variant_record.features.append(feature) + + self.variant_records.append(variant_record) diff --git 
a/dna_mutator/__init__.py b/dna_mutator/__init__.py new file mode 100644 index 0000000..e2aa00b --- /dev/null +++ b/dna_mutator/__init__.py @@ -0,0 +1 @@ +from .Mutator import Mutator diff --git a/dna_mutator/version.py b/dna_mutator/version.py new file mode 100644 index 0000000..3dc1f76 --- /dev/null +++ b/dna_mutator/version.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/images/egf.png b/images/egf.png new file mode 100644 index 0000000..6e0f67e Binary files /dev/null and b/images/egf.png differ diff --git a/pypi-readme.rst b/pypi-readme.rst new file mode 100644 index 0000000..db4e3d9 --- /dev/null +++ b/pypi-readme.rst @@ -0,0 +1,33 @@ +DNA Mutator +============================= + +Creating variants of DNA sequences + + +**Install:** + +.. code:: bash + + pip install dna_mutator + + +**Web documentation:** + +`