Skip to content

Commit

Permalink
Merge pull request #27 from arvestad/select_subalignment
Browse files Browse the repository at this point in the history
Support for selecting and viewing subalignments
  • Loading branch information
arvestad authored Mar 2, 2024
2 parents a6fc1ac + 7894890 commit e8fff8f
Show file tree
Hide file tree
Showing 9 changed files with 73 additions and 40 deletions.
13 changes: 13 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,42 +1,55 @@
# Changes since last version


## v1.8.0

* Added support for selecting sub-alignments. See option `-sa`


## v1.7.2

* Ed Davies fixed a bug: sorting using option -so was broken


## v1.7.1

* What??


## v1.7.0

* Added the option --alignment-index to support files with multiple MSAs.
* Added two functions, view and glimpse, to enable use of alv in notebook environments.


## v1.6.1

* Fixed accession abbreviation so that short accessions are left as they are.
* Requiring Python 3.5 or later, because Colorama has dropped support for 3.4.


## v1.6.0

* New options: --only-variable and --only-variable-excluding-indels, contributed by nikostr. Constrains coloring
to columns with variation and variation not counting indels.
* Fixed the --dotted option, which only worked with the first block for DNA sequences. Also improved the coloring
which was too ugly in dotted mode (due to laziness).


## v1.5.0

* New option: `-d` or `--dotted`: the first sequence in the output alignment is used as a template and for positions
in subsequent sequences that are identical, a period ('.') is output instead of a symbol.
* Adjustment: replacing blue with cyan in the DNA coloring scheme.


## v1.4.0

* New option: `-r k` or `--random-accessions k` for only showing a sample of _k_ sequences.
* New option: `-g` or `--glimpse`: display an informative cut-out of the input MSA, if it does
not fit without scrolling or line-breaking.


## v1.3.4

* For some reason, setup.py was/is not putting proper Markdown to PyPi. Solved it halfway. Weird issue.
Expand Down
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,14 @@ alv -k msa.fa | less -R
The `-k` option ensures that `alv` keeps coloring the alignment (by default, piping
and redirection removes colors), and the `-R` option instructs `less` to interpret color codes.


Choose to view a sub-alignment:
```
alv -sa 30 60 msa.fa
```
This selects and views columns 30 to 59 of msa.fa, keeping track of the "original" column indexes in the output.


## For developers

Run `python setup.py develop test` for development install and to execute tests.
Expand Down
3 changes: 2 additions & 1 deletion THANKS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@
completely without "codons".
* Michael Milton suggested the --alignment-index option.
* Marina Herrera Sarrias suggested the notebook functionality.
* Ed Davies fixed a bug: sorting using option -so was broken
* Ed Davies fixed a bug: sorting using option -so was broken.
* Artem Kushner pointed out the missing support for sub-alignments.
57 changes: 30 additions & 27 deletions alv/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,14 @@ class BaseAlignment:
to access all alignments, and implementations for the subclasses
with single-letter column widths.
'''
def __init__(self, alignment):
def __init__(self, alignment, start=0, end=-1):
self.al = alignment # Holder of the BioPython alignment object
self.al_length = alignment.get_alignment_length()
self.type = None
self._start = start
self._end = alignment.get_alignment_length()
if end != -1:
self._end = min(end, self._end)
self.al_length = self._end - self._start
self.column_width = 1
self._update_seq_index()
self.columns = self._summarize_columns()
Expand Down Expand Up @@ -121,13 +125,12 @@ def _compute_block_width(self, terminal_width, al_width, left_margin):
return terminal_width - left_margin - sacrifice

def blocks(self, block_width):
al_width = self.al_length
if al_width == 0:
if self.al_length == 0:
raise AlvEmptyAlignment()
else:
for start in range(0, al_width, block_width):
end = min(al_width, start + block_width)
yield AlignmentBlock(start, end)
for block_start in range(self._start, self._end, block_width):
block_end = min(self._end, block_start + block_width)
yield AlignmentBlock(block_start, block_end)


def apply_painter(self, acc, block, painter):
Expand Down Expand Up @@ -167,9 +170,9 @@ def _summarize_columns(self):
'''
Count the different elements in each column.
'''
columns = []
for col_no in range(self.al_length):
columns.append(Counter(self.al[:, col_no]))
columns = {}
for col_no in range(self._start, self._end):
columns[col_no] = Counter(self.al[:, col_no])
return columns

def get_basic_info(self):
Expand All @@ -188,7 +191,7 @@ def get_column_conservation(self):
'''
result = [] # For the return value
column_summaries = self._summarize_columns()
for c in column_summaries:
for idx, c in column_summaries.items():
for indel in self.indels:
del c[indel]
if len(c) == 0:
Expand Down Expand Up @@ -222,24 +225,24 @@ def get_conserved_block(self, n_columns):


class AminoAcidAlignment(BaseAlignment):
def __init__(self, alignment):
super().__init__(alignment)
def __init__(self, alignment, start=0, end=-1):
super().__init__(alignment, start, end)
self.type = 'aa'

class DnaAlignment(BaseAlignment):
def __init__(self, alignment):
super().__init__(alignment)
def __init__(self, alignment, start=0, end=-1):
super().__init__(alignment, start, end)
self.type = 'dna'

class CodonAlignment(BaseAlignment):
'''
Alignment of coding DNA. A column has a width of three nucleotides.
'''
def __init__(self, alignment):
def __init__(self, alignment, start=0, end=-1):
self.type = 'codon'
self.column_width = 3
self.genetic_code = 1 # The standard code
super().__init__(alignment)
super().__init__(alignment, start, end)
self.basic_info['Genetic code'] = self.genetic_code

def block_width(self, terminal_width, args):
Expand All @@ -261,9 +264,9 @@ def apply_painter(self, acc, block, painter):
seq = str(seq_record.seq)
colored_seq = ''

for codon_col, pos in enumerate(range(0, len(seq), 3)):
c = seq[pos:pos+3]
colored_seq += painter.colorizer(c, self.columns[block.start // 3 + codon_col])
for codon_col, pos in enumerate(range(block.start, block.end, 3)):
c = seq[codon_col:codon_col+3]
colored_seq += painter.colorizer(c, self.columns[pos])
return painter.sol() + colored_seq + painter.eol()

def apply_dotter(self, acc, block, painter, template_acc):
Expand All @@ -281,12 +284,12 @@ def apply_dotter(self, acc, block, painter, template_acc):
template_seq = template_record.seq

colored_seq = ''
for codon_col_no, pos in enumerate(range(0, len(seq), 3)):
for codon_col, pos in enumerate(range(block.start, block.end, 3)):
c = seq[pos:pos+3]
if c == template_seq[pos:pos+3]:
colored_seq += '...'
else:
colored_seq += painter.colorizer(c, self.columns[block.start // 3 + codon_col_no])
colored_seq += painter.colorizer(c, self.columns[pos])
return painter.sol() + colored_seq + painter.eol()


Expand All @@ -296,11 +299,11 @@ def _summarize_columns(self):
Specialization of base method for codon columns. Do not focus on the nucleotides, but look at
amino acid columns.
'''
columns = []
for pos in range(0, self.al_length, 3):
columns = {}
for pos in range(self._start, self._end, 3):
codon_column = map(lambda r: str(r.seq), self.al[:, pos:pos+3])
aa_column = map(lambda codon: self._translate(codon), codon_column)
columns.append(Counter(aa_column))
columns[pos] = Counter(aa_column)
return columns


Expand All @@ -310,7 +313,7 @@ def get_conserved_block(self, n_columns):
Used for alignment glimpses.
Specialised for codon alignments.
'''
conservation = self.get_column_conservation() # A list of conservation scores. Want a maximised "window"
conservation = self.get_column_conservation() # A dict of conservation scores. Want a maximised "window"
accumulated_conservation = []
acc = 0
for c in conservation:
Expand All @@ -321,7 +324,7 @@ def get_conserved_block(self, n_columns):

codon_al_width = math.floor(self.al_width() / 3)
if codon_al_width <= n_columns:
return AlignmentBlock(0, codon_al_width)
return AlignmentBlock(0, self._end)
else:
best_start = max(range(0, codon_al_width - n_columns),
key=lambda i: accumulated_conservation[i+n_columns] - accumulated_conservation[i])
Expand Down
1 change: 0 additions & 1 deletion alv/alignmentterminal.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import shutil



class AlignmentTerminal:
'''
This class encapsulates knowledge about the terminal and how to draw on it.
Expand Down
14 changes: 8 additions & 6 deletions alv/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,12 @@ def guess_format(filename):
return 'phylip'


def read_alignment(file, seqtype, input_format, color_scheme, genetic_code, alignment_no=0):
def read_alignment(file, seqtype, input_format, color_scheme, genetic_code, al_start=0, al_end=-1, alignment_no=0):
'''
Factory function. Read the alignment with BioPython's support, and
return an appropriate alv alignment.
al_start and al_end: restrict the alignment to a subalignment defined by these indices
'''
if file == '-':
file = sys.stdin # Start reading from stdin if "magic filename"
Expand All @@ -50,11 +52,11 @@ def read_alignment(file, seqtype, input_format, color_scheme, genetic_code, alig
n_msas = len(alignments)
if alignment_no >= n_msas:
raise IOError(f'Alignment index too large, only {n_msas} alignment(s) in the file.')
return get_alv_objects(alignments[alignment_no], seqtype, color_scheme, genetic_code)
return get_alv_objects(alignments[alignment_no], seqtype, color_scheme, genetic_code, al_start, al_end)
else:
raise ValueError('No alignment in input')

def get_alv_objects(alignment, seqtype, color_scheme, genetic_code):
def get_alv_objects(alignment, seqtype, color_scheme, genetic_code, al_start, al_end):
'''
Take the alignment object and return a suitable Alv alignment object, with respect
to sequence type, and colorization object ("painter").
Expand All @@ -70,13 +72,13 @@ def get_alv_objects(alignment, seqtype, color_scheme, genetic_code):
painter = AminoAcidPainter()

if seqtype == 'aa':
return AminoAcidAlignment(alignment), painter
return AminoAcidAlignment(alignment, al_start, al_end), painter
if seqtype == 'dna' or seqtype == 'rna':
return DnaAlignment(alignment), DnaPainter()
return DnaAlignment(alignment, al_start, al_end), DnaPainter()
elif seqtype == 'codon':
al = CodonAlignment(alignment)
al.set_genetic_code(genetic_code)
return CodonAlignment(alignment), CodonPainter(painter)
return CodonAlignment(alignment, al_start, al_end), CodonPainter(painter)
else:
raise IOError(f'Unknown sequence type: "{seqtype}"')

Expand Down
2 changes: 1 addition & 1 deletion alv/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.7.2'
__version__ = '1.8.0'
11 changes: 9 additions & 2 deletions bin/alv
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import alv
import argparse
import os
import sys
from traceback import print_last
from traceback import print_exc

from alv.version import __version__
from alv.alignmentterminal import AlignmentShellTerminal
Expand Down Expand Up @@ -110,6 +110,8 @@ def setup_argument_parsing():
help='Comma-separated list of accessions. Sequences will be presented in this order. Also note that one can choose which sequences to present with this opion. Overrides -s and -si.')
ordering_args.add_argument('-sm', '--select-matching', metavar='ACCESSION_PATTERN', type=str,
help='Only show sequences with accessions containing ACCESSION_PATTERN.')
ordering_args.add_argument('-sa', '--sub-alignment', nargs=2, metavar='INT', type=int,
help='Only show alignment columns given by FROM and UPTO indices.')

# Options for limiting colorization
restriction_args = ap.add_argument_group('Restricting colorization')
Expand Down Expand Up @@ -147,7 +149,11 @@ def input_and_option_adaption(args):
format = 'fasta' # Hard guess, because a bit complicated when reading from pipe (sys.stdin)
else:
format = args.format
alignment, painter = io.read_alignment(args.infile, args.type, format, args.color_scheme, args.code, args.alignment_index)
if args.sub_alignment: # Are we restricting the alignment?
start_col, end_col = args.sub_alignment
else:
start_col, end_col = 0, -1
alignment, painter = io.read_alignment(args.infile, args.type, format, args.color_scheme, args.code, start_col, end_col, args.alignment_index)
return alignment, painter

except KeyboardInterrupt:
Expand Down Expand Up @@ -253,6 +259,7 @@ def main():
print('alv:', e, file=sys.stderr)
ap.exit(3)
except Exception as e:
print_exc()
print('Alv bug! Please report!', file=sys.stderr)
print(e, file=sys.stderr)
ap.exit(2)
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ def read_version_string(filename):
'biopython>=1.70',
'colorama>=0.3.8',
],
classifiers=(
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
"Topic :: Scientific/Engineering :: Bio-Informatics",
),
],
)

0 comments on commit e8fff8f

Please sign in to comment.