Skip to content

Commit

Permalink
update quantification strategies, set reasonable defaults
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewprzh committed Apr 25, 2024
1 parent 72b96ba commit 76ae33b
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 61 deletions.
24 changes: 12 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -460,18 +460,18 @@ where `FILE` is the file name, `READ_COL` is column with read ids (0 if not set)

#### Quantification

`--transcript_quantification` Transcript quantification strategy, should be one of:

* `unique_only` - only reads that are uniquely assigned and consistent with a transcript are used for quantification;
* `with_ambiguous` - ambiguously assigned reads are split between transcripts with equal weights (e.g. 1/2 when a read is assigned to 2 transcripts simultaneously, default mode);
* `with_inconsistent` - uniquely assigned reads with non-intronic inconsistencies (i.e. alternative poly-A site, TSS etc) are also included;
* `all` - all of the above.

`--gene_quantification` Gene quantification strategy, should be one of:

* `unique_only` - only reads that are uniquely assigned to a gene and consistent with any of gene's isoforms are used for quantification;
* `with_ambiguous` - ambiguously assigned reads are split between genes with equal weights (e.g. 1/2 when a read is assigned to 2 genes simultaneously);
* `with_inconsistent` - only reads that are uniquely assigned to a gene but are not necessarily consistent with genes isoforms (default);
`--transcript_quantification` Transcript quantification strategy;
`--gene_quantification` Gene quantification strategy;

Available options for quantification:

* `unique_only` - use only reads that are uniquely assigned and consistent with a transcript/gene
(i.e. flagged as unique/unique_minor_difference), default fot transcript quantification;
* `with_ambiguous` - in addition to unique reads, ambiguously assigned consistent reads are split between features with equal weights
(e.g. 1/2 when a read is assigned to 2 features simultaneously);
* `unique_splicing_consistent` - uses uniquely assigned reads that do not contradict annotated splice sites
(i.e. flagged as unique/unique_minor_difference or inconsistent_non_intronic), default for gene quantification;
* `unique_inconsistent` - uses uniquely assigned reads allowing any kind of inconsistency;
* `all` - all of the above.


Expand Down
73 changes: 41 additions & 32 deletions isoquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies
from src.graph_based_model_construction import StrandnessReportingLevel
from src.long_read_assigner import AmbiguityResolvingMethod
from src.long_read_counter import COUNTING_STRATEGIES
from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy
from src.input_data_storage import InputDataStorage
from src.multimap_resolver import MultimapResolvingStrategy
from src.stats import combine_counts
Expand All @@ -56,6 +56,7 @@ def parse_args(cmd_args=None, namespace=None):
input_args_group = parser.add_argument_group('Input data')
output_args_group = parser.add_argument_group('Output naming')
pipeline_args_group = parser.add_argument_group('Pipeline options')
algo_args_group = parser.add_argument_group('Algorithm settings')

other_options = parser.add_argument_group("Additional options:")
show_full_help = '--full_help' in cmd_args
Expand Down Expand Up @@ -132,37 +133,40 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help
help="reads represent FL transcripts; both ends of the read are considered to be reliable")

# ALGORITHM
add_additional_option("--delta", type=int, default=None,
help="delta for inexact splice junction comparison, chosen automatically based on data type")
add_hidden_option("--graph_clustering_distance", type=int, default=None,
help="intron graph clustering distance, "
"splice junctions less that this number of bp apart will not be differentiated")
add_additional_option("--matching_strategy", choices=["exact", "precise", "default", "loose"],
help="read-to-isoform matching strategy from the most strict to least", type=str, default=None)
add_additional_option("--splice_correction_strategy", choices=["none", "default_pacbio", "default_ont", "conservative_ont", "all", "assembly"],
help="read alignment correction strategy to use",
type=str, default=None)
add_additional_option("--model_construction_strategy", choices=["reliable", "default_pacbio",
"sensitive_pacbio", "fl_pacbio", "default_ont",
"sensitive_ont", "all", "assembly"],
help="transcript model construction strategy to use",
type=str, default=None)

add_additional_option("--transcript_quantification", choices=COUNTING_STRATEGIES,
help="transcript quantification strategy", type=str, default="with_ambiguous")
add_additional_option("--gene_quantification", choices=COUNTING_STRATEGIES,
help="gene quantification strategy", type=str, default="with_inconsistent")
add_additional_option("--report_novel_unspliced", "-u", type=bool_str,
help="report novel monoexonic transcripts (true/false), "
"default: false for ONT, true for other data types")
add_additional_option("--report_canonical", type=str, choices=[e.name for e in StrandnessReportingLevel],
help="reporting level for novel transcripts based on canonical splice sites;"
" default: " + StrandnessReportingLevel.auto.name,
default=StrandnessReportingLevel.only_stranded.name)
add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies],
help="require polyA tails to be present when reporting transcripts; "
"default: auto (requires polyA only when polyA percentage is >= 70%%)",
default=PolyAUsageStrategies.auto.name)
add_additional_option_to_group(algo_args_group, "--report_novel_unspliced", "-u", type=bool_str,
help="report novel monoexonic transcripts (true/false), "
"default: false for ONT, true for other data types")
add_additional_option_to_group(algo_args_group, "--report_canonical", type=str,
choices=[e.name for e in StrandnessReportingLevel],
help="reporting level for novel transcripts based on canonical splice sites;"
" default: " + StrandnessReportingLevel.auto.name,
default=StrandnessReportingLevel.only_stranded.name)
add_additional_option_to_group(algo_args_group, "--polya_requirement", type=str,
choices=[e.name for e in PolyAUsageStrategies],
help="require polyA tails to be present when reporting transcripts; "
"default: auto (requires polyA only when polyA percentage is >= 70%%)",
default=PolyAUsageStrategies.auto.name)

add_additional_option_to_group(algo_args_group, "--transcript_quantification", choices=COUNTING_STRATEGIES,
help="transcript quantification strategy", type=str,
default=CountingStrategy.unique_only.name)
add_additional_option_to_group(algo_args_group, "--gene_quantification", choices=COUNTING_STRATEGIES,
help="gene quantification strategy", type=str,
default=CountingStrategy.unique_splicing_consistent.name)

add_additional_option_to_group(algo_args_group, "--matching_strategy",
choices=["exact", "precise", "default", "loose"],
help="read-to-isoform matching strategy from the most strict to least",
type=str, default=None)
add_additional_option_to_group(algo_args_group, "--splice_correction_strategy",
choices=["none", "default_pacbio", "default_ont",
"conservative_ont", "all", "assembly"],
help="read alignment correction strategy to use", type=str, default=None)
add_additional_option_to_group(algo_args_group, "--model_construction_strategy",
choices=["reliable", "default_pacbio", "sensitive_pacbio", "fl_pacbio",
"default_ont", "sensitive_ont", "all", "assembly"],
help="transcript model construction strategy to use", type=str, default=None)

# OUTPUT PROPERTIES
pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int,
default="16")
Expand All @@ -188,6 +192,11 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help
help="align reads to reference without running further analysis")

# ADDITIONAL
add_additional_option("--delta", type=int, default=None,
help="delta for inexact splice junction comparison, chosen automatically based on data type")
add_hidden_option("--graph_clustering_distance", type=int, default=None,
help="intron graph clustering distance, "
"splice junctions less that this number of bp apart will not be differentiated")
add_additional_option("--no_gzip", help="do not gzip large output files", dest="gzipped",
action='store_false', default=True)
add_additional_option("--no_gtf_check", help="do not perform GTF checks", dest="gtf_check",
Expand Down
33 changes: 16 additions & 17 deletions src/long_read_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,28 +22,29 @@
class CountingStrategy(Enum):
unique_only = 1
with_ambiguous = 2
with_inconsistent = 10
with_inconsistent_minor = 11
unique_splicing_consistent = 10
unique_inconsistent = 11
# all_splicing_consistent = 19
all = 20
all_minor = 21

def no_inconsistent(self):
return self in [CountingStrategy.unique_only, CountingStrategy.with_ambiguous]

def ambiguous(self):
return self in [CountingStrategy.all, CountingStrategy.all_minor, CountingStrategy.with_ambiguous]
return self in [CountingStrategy.all, CountingStrategy.with_ambiguous]

def inconsistent_minor(self):
return self in [CountingStrategy.with_inconsistent_minor, CountingStrategy.all_minor]
return self == CountingStrategy.unique_splicing_consistent

def inconsistent(self):
return self in [CountingStrategy.with_inconsistent_minor, CountingStrategy.all_minor,
CountingStrategy.with_inconsistent, CountingStrategy.all]
return self in [CountingStrategy.unique_splicing_consistent,
CountingStrategy.unique_inconsistent, CountingStrategy.all]


COUNTING_STRATEGIES = [CountingStrategy.unique_only.name,
CountingStrategy.with_ambiguous.name,
CountingStrategy.with_inconsistent.name,
CountingStrategy.unique_splicing_consistent.name,
CountingStrategy.unique_inconsistent.name,
CountingStrategy.all.name]


Expand Down Expand Up @@ -94,19 +95,17 @@ def confirms_feature(read_assignment):


class ReadWeightCounter:
def __init__(self, strategy_str, relax_strategy=True):
def __init__(self, strategy_str):
self.strategy = CountingStrategy[strategy_str]
if relax_strategy:
if self.strategy == CountingStrategy.with_inconsistent:
self.strategy = CountingStrategy.with_inconsistent_minor
elif self.strategy == CountingStrategy.all:
self.strategy = CountingStrategy.all_minor
self.strategy_flags = CountingStrategyFlags(self.strategy)

def process_inconsistent(self, assignment_type, feature_count):
# use only for inconsistent assignments
if assignment_type == ReadAssignmentType.inconsistent_ambiguous or feature_count > 1:
return 0.0
if self.strategy_flags.use_ambiguous:
return 1.0 / feature_count
else:
return 0.0
if self.strategy_flags.use_inconsistent:
return 1.0
if self.strategy_flags.use_inconsistent_minor and assignment_type == ReadAssignmentType.inconsistent_non_intronic:
Expand Down Expand Up @@ -365,15 +364,15 @@ def convert_counts_to_tpm(self):

def create_gene_counter(output_file_name, strategy, complete_feature_list=None,
read_groups=None, ignore_read_groups=False, output_zeroes=True):
read_weight_counter = ReadWeightCounter(strategy, relax_strategy=False)
read_weight_counter = ReadWeightCounter(strategy)
return AssignedFeatureCounter(output_file_name, GeneAssignmentExtractor,
read_groups, read_weight_counter,
complete_feature_list, ignore_read_groups, output_zeroes)


def create_transcript_counter(output_file_name, strategy, complete_feature_list=None,
read_groups=None, ignore_read_groups=False, output_zeroes=True):
read_weight_counter = ReadWeightCounter(strategy, relax_strategy=True)
read_weight_counter = ReadWeightCounter(strategy)
return AssignedFeatureCounter(output_file_name, TranscriptAssignmentExtractor,
read_groups, read_weight_counter,
complete_feature_list, ignore_read_groups, output_zeroes)
Expand Down

0 comments on commit 76ae33b

Please sign in to comment.