Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to compute distances from gene TES only ('--edge=tes') #77

Merged
merged 4 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,18 @@ example:
.. image:: nearest_tss.png
:align: center

Alternatively distances can be calculated as the shortest distance
between either of the peak edges to either the TSS or the TES of
the gene, by specifying the ``--edge=both`` option::
(This behaviour can be made explicit by specifying the
``--edge=tss`` option.)

Alternatively distances can be calculated as the distance from
the gene TES (by specifying the ``--edge=tes``), or as the shortest
distance between either of the peak edges to whichever of the TSS or
the TES of the gene is closest (by specifying the ``--edge=both``
option).

For example:

::

RnaChipIntegrator --edge=both GENES PEAKS

Expand Down
72 changes: 57 additions & 15 deletions rnachipintegrator/analysis.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/bin/env python
#
# analysis.py: analyses of peaks vs features and vice versa
# Copyright (C) University of Manchester 2011-2019 Peter Briggs, Leo Zeef
# & Ian Donaldson
# Copyright (C) University of Manchester 2011-2019,2024 Peter Briggs,
# Leo Zeef & Ian Donaldson
#
"""
analysis.py
Expand All @@ -29,7 +29,7 @@
# distances from an arbitrary zero, and then calculating distances
# between things with reference to that?

def find_nearest_features(peaks,features,distance=None,tss_only=False,
def find_nearest_features(peaks,features,distance=None,use_edge='BOTH',
only_differentially_expressed=False):
"""
Locate nearest features for each peak
Expand All @@ -38,9 +38,10 @@ def find_nearest_features(peaks,features,distance=None,tss_only=False,
features (FeatureList): list of features
peaks (PeakList): list of peaks
distance (int): optional cut-off distance to apply
tss_only (bool): only consider distances from the
feature TSS (default is to consider distances from
both the TSS and TES)
use_edge (str): one of 'BOTH' (the default), 'TSS'
or 'TSS'. 'Both' considers distances from both the
feature TSS and TES and picks the nearest, whereas
'tss' and 'tes' only consider the specified edge.
only_differentially_expressed (bool): only consider
features that are flagged as differentially expressed
pad (bool): add extra 'None' items to output
Expand All @@ -58,17 +59,22 @@ def find_nearest_features(peaks,features,distance=None,tss_only=False,
# Differentially-expressed features only?
if only_differentially_expressed:
feature_list = feature_list.filterByFlag(1)
if tss_only:
if use_edge == "TSS":
sort_features_by_tss_distances(peak,feature_list)
elif use_edge == "TES":
sort_features_by_tes_distances(peak,feature_list)
else:
sort_features_by_edge_distances(peak,feature_list)
# Apply distance cut-off
if distance is not None:
closest = FeatureSet()
for feature in feature_list:
if tss_only:
if use_edge == "TSS":
if distances.distance_tss(peak,feature) > distance:
break
elif use_edge == "TES":
if distances.distance_tes(peak,feature) > distance:
break
else:
if distances.distance_closest_edge(peak,feature) > distance:
break
Expand All @@ -80,7 +86,7 @@ def find_nearest_features(peaks,features,distance=None,tss_only=False,
# Return result
yield (peak,feature_list)

def find_nearest_peaks(features,peaks,distance=None,tss_only=False,
def find_nearest_peaks(features,peaks,distance=None,use_edge='BOTH',
only_differentially_expressed=False):
"""
Locate nearest peaks for each feature
Expand All @@ -89,9 +95,10 @@ def find_nearest_peaks(features,peaks,distance=None,tss_only=False,
features (FeatureList): list of features
peaks (PeakList): list of peaks
distance (int): optional cut-off distance to apply
tss_only (bool): only consider distances from the
feature TSS (default is to consider distances from
both the TSS and TES)
use_edge (str): one of 'BOTH' (the default), 'TSS'
or 'TSS'. 'Both' considers distances from both the
feature TSS and TES and picks the nearest, whereas
'tss' and 'tes' only consider the specified edge.
only_differentially_expressed (bool): only consider
features that are flagged as differentially expressed

Expand All @@ -102,9 +109,12 @@ def find_nearest_peaks(features,peaks,distance=None,tss_only=False,

"""
# Set functions according to edges
if tss_only:
if use_edge == "TSS":
sort_peaks = sort_peaks_by_tss_distances
get_distance = distances.distance_tss
elif use_edge == "TES":
sort_peaks = sort_peaks_by_tes_distances
get_distance = distances.distance_tes
else:
sort_peaks = sort_peaks_by_edge_distances
get_distance = distances.distance_closest_edge
Expand Down Expand Up @@ -145,7 +155,7 @@ def sort_features_by_edge_distances(peak,features):
"""
features.features = sorted(features.features,
key = lambda f:
distances.edge_distances(peak,f))
(distances.edge_distances(peak,f),f.id))

def sort_features_by_tss_distances(peak,features):
"""
Expand All @@ -161,7 +171,23 @@ def sort_features_by_tss_distances(peak,features):
"""
features.features = sorted(features.features,
key = lambda f:
distances.tss_distances(peak,f))
(distances.tss_distances(peak,f),f.id))

def sort_features_by_tes_distances(peak,features):
"""
Sort features by TES-to-edge distances to a peak

Arguments:
peak (Peak): peak instance
features (FeatureSet): set of features that will
be sorted into order according to the smallest
distance of their TES positions to the edges of
the peak. The sorting is done in place.

"""
features.features = sorted(features.features,
key = lambda f:
(distances.tes_distances(peak,f),f.id))

def sort_peaks_by_edge_distances(feature,peaks):
"""
Expand Down Expand Up @@ -194,3 +220,19 @@ def sort_peaks_by_tss_distances(feature,peaks):
peaks.peaks = sorted(peaks.peaks,
key = lambda p:
distances.tss_distances(p,feature))

def sort_peaks_by_tes_distances(feature,peaks):
"""
Sort peaks by edge-to-TES distances to a feature

Arguments:
feature (Feature): feature instance
peaks (PeakSet): set of peaks that will be
sorted into order according to the smallest
distance of their edges from the TES position
of the feature. The sorting is done in place.

"""
peaks.peaks = sorted(peaks.peaks,
key = lambda p:
distances.tes_distances(p,feature))
40 changes: 22 additions & 18 deletions rnachipintegrator/cli.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env python
#
# RnaChipIntegrator.py: analyse genomic features (genes) with peak data
# Copyright (C) University of Manchester 2011-19 Peter Briggs, Leo Zeef
# & Ian Donaldson
# Copyright (C) University of Manchester 2011-19,2024 Peter Briggs,
# Leo Zeef & Ian Donaldson
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the Artistic License 2.0 (see the file LICENSE
Expand Down Expand Up @@ -207,13 +207,13 @@ def add_edge_option(self,group=None):
"""
self.add_option('--edge',
action='store',dest="edge",
choices=('tss','both'),
choices=('tss','tes','both'),
default='tss',
help="Gene edges to consider when calculating "
"distances between genes and peaks, either: "
"'tss' (default: only use gene TSS) or 'both' "
"(use whichever of TSS or TES gives shortest "
"distance)",
"'tss' (default: only use gene TSS), 'tes' "
"(only use gene TES), or 'both' (use whichever "
"of TSS or TES gives shortest distance)",
group=group)

def add_only_de_option(self,group=None):
Expand Down Expand Up @@ -497,7 +497,8 @@ class AnalysisParams(object):
genes: FeatureSet instance
peaks: PeakSet instance
cutoff: cutoff distance
tss_only: whether to use TSS or both TSS and TES
use_edge: whether to use both TSS and TES, TSS only
or TES only
only_differentially_expressed: whether to restrict
genes to those flagged as differenitally
expressed
Expand All @@ -514,7 +515,7 @@ def __init__(self,**kws):
genes=None,
peaks=None,
cutoff=None,
tss_only=False,
use_edge="BOTH",
only_differentially_expressed=False)
for key in kws:
if key not in self._params:
Expand All @@ -531,8 +532,8 @@ def peaks(self):
def cutoff(self):
return self._params['cutoff']
@property
def tss_only(self):
return self._params['tss_only']
def use_edge(self):
return self._params['use_edge']
@property
def only_differentially_expressed(self):
return self._params['only_differentially_expressed']
Expand Down Expand Up @@ -675,7 +676,7 @@ def find_nearest_features(params):
for peak,genes in analysis.find_nearest_features(
params.peaks,
params.genes,
tss_only=params.tss_only,
use_edge=params.use_edge,
distance=params.cutoff,
only_differentially_expressed=\
params.only_differentially_expressed):
Expand All @@ -695,7 +696,7 @@ def find_nearest_peaks(params):
for gene,peaks in analysis.find_nearest_peaks(
params.genes,
params.peaks,
tss_only=params.tss_only,
use_edge=params.use_edge,
distance=params.cutoff,
only_differentially_expressed=\
params.only_differentially_expressed):
Expand Down Expand Up @@ -853,9 +854,11 @@ def main(args=None):

# Gene edge to use
if options.edge == 'tss':
tss_only = True
use_edge = "TSS"
elif options.edge == 'tes':
use_edge = "TES"
else:
tss_only = False
use_edge = "BOTH"

# Promoter region
promoter = (abs(int(options.promoter_region.split(',')[0])),
Expand Down Expand Up @@ -960,8 +963,7 @@ def main(args=None):
print("Cutoffs (bp) : %s" % ','.join([str(d) if d is not None
else "no cutoff"
for d in cutoffs]))
print("Edge : %s" % ('TSS only' if tss_only
else 'TSS or TES'))
print("Edge : %s" % use_edge)
print("DE only : %s" % ('yes' if options.only_diff_expressed
else 'no'))
print("Nprocs : %s" % options.nprocs)
Expand Down Expand Up @@ -1021,7 +1023,7 @@ def main(args=None):
genes=gene_lists[genes],
peaks=peak_lists[peaks],
cutoff=cutoff,
tss_only=tss_only,
use_edge=use_edge,
only_differentially_expressed=
options.only_diff_expressed
))
Expand Down Expand Up @@ -1182,8 +1184,10 @@ def main(args=None):
else "%d" % options.max_closest))
xlsx.append_to_notes("Promoter region (bp from TSS)\t-%d to %d" %
promoter)
if tss_only:
if use_edge == "TSS":
xlsx.append_to_notes("Distances calculated from\tTSS only")
elif use_edge == "TES":
xlsx.append_to_notes("Distances calculated from\tTES only")
else:
xlsx.append_to_notes("Distances calculated from\tTSS or TES")
xlsx.append_to_notes("Only use differentially expressed %ss\t%s" %
Expand Down
38 changes: 36 additions & 2 deletions rnachipintegrator/distances.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/bin/env python
#
# distances.py: functions for determining distances and overlaps
# Copyright (C) University of Manchester 2011-2019 Peter Briggs, Leo Zeef
# & Ian Donaldson
# Copyright (C) University of Manchester 2011-2019,2024 Peter Briggs,
# Leo Zeef & Ian Donaldson
#
"""
distances.py
Expand Down Expand Up @@ -218,6 +218,40 @@ def tss_distances(peak,feature):
d_tss.sort()
return tuple(d_tss)

def tes_distances(peak,feature):
"""
Return distances between peak edges and feature TES

Arguments:
peak (Peak): peak instance
feature (Feature): feature instance

Returns:
tuple: a pair of distances, the first being the
smallest distance between an edge of the peak
region to the TES of the feature region, and
the second being the distance from the other peak
edge to the TES.
If the TES position is entirely contained within
the peak then the smallest distance is returned as
zero, the second is the average of the distances
to the two peak edges from the TES.

"""
# TES is considered as a point
if (feature.tes >= peak.start and
feature.tes <= peak.end):
# TES entirely contained in the peak
# Rank using the average distance of the peak
# edges from the TES
return (0,(abs(feature.tes - peak.start) +
abs(feature.tes - peak.end))/2)
# Possible distances to TES
d_tes = [abs(feature.tes - peak.start),
abs(feature.tes - peak.end)]
d_tes.sort()
return tuple(d_tes)

def distance_closest_edge(peak,feature):
"""
Get distance from a peak to a feature
Expand Down
Loading
Loading