update quantification strategies, set reasonable defaults

ablab · Apr 25, 2024 · 76ae33b · 76ae33b
1 parent 72b96ba
commit 76ae33b
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 61 deletions.
diff --git a/README.md b/README.md
@@ -460,18 +460,18 @@ where `FILE` is the file name, `READ_COL` is column with read ids (0 if not set)
 
 #### Quantification
 
-`--transcript_quantification` Transcript quantification strategy, should be one of:
-
-* `unique_only` - only reads that are uniquely assigned and consistent with a transcript are used for quantification;
-* `with_ambiguous` - ambiguously assigned reads are split between transcripts with equal weights (e.g. 1/2 when a read is assigned to 2 transcripts simultaneously, default mode);
-* `with_inconsistent` - uniquely assigned reads with non-intronic inconsistencies (i.e. alternative poly-A site, TSS etc) are also included;
-* `all` - all of the above.
-
-`--gene_quantification` Gene quantification strategy, should be one of:
-
-* `unique_only` -  only reads that are uniquely assigned to a gene and consistent with any of gene's isoforms are used for quantification;
-* `with_ambiguous` - ambiguously assigned reads are split between genes with equal weights (e.g. 1/2 when a read is assigned to 2 genes simultaneously);
-* `with_inconsistent` - only reads that are uniquely assigned to a gene but are not necessarily consistent with genes isoforms (default);
+`--transcript_quantification` Transcript quantification strategy;
+`--gene_quantification` Gene quantification strategy;
+
+Available options for quantification:
+
+* `unique_only` - use only reads that are uniquely assigned and consistent with a transcript/gene
+(i.e. flagged as unique/unique_minor_difference), default fot transcript quantification;
+* `with_ambiguous` - in addition to unique reads, ambiguously assigned consistent reads are split between features with equal weights 
+(e.g. 1/2 when a read is assigned to 2 features simultaneously);
+* `unique_splicing_consistent` - uses uniquely assigned reads that do not contradict annotated splice sites
+(i.e. flagged as unique/unique_minor_difference or inconsistent_non_intronic), default for gene quantification;
+* `unique_inconsistent` - uses uniquely assigned reads allowing any kind of inconsistency;
 * `all` - all of the above.
 
 

diff --git a/isoquant.py b/isoquant.py
@@ -35,7 +35,7 @@
 from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies
 from src.graph_based_model_construction import StrandnessReportingLevel
 from src.long_read_assigner import AmbiguityResolvingMethod
-from src.long_read_counter import COUNTING_STRATEGIES
+from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy
 from src.input_data_storage import InputDataStorage
 from src.multimap_resolver import MultimapResolvingStrategy
 from src.stats import combine_counts
@@ -56,6 +56,7 @@ def parse_args(cmd_args=None, namespace=None):
     input_args_group = parser.add_argument_group('Input data')
     output_args_group = parser.add_argument_group('Output naming')
     pipeline_args_group = parser.add_argument_group('Pipeline options')
+    algo_args_group = parser.add_argument_group('Algorithm settings')
 
     other_options = parser.add_argument_group("Additional options:")
     show_full_help = '--full_help' in cmd_args
@@ -132,37 +133,40 @@ def add_hidden_option(*args, **kwargs):  # show command only with --full-help
                         help="reads represent FL transcripts; both ends of the read are considered to be reliable")
 
     # ALGORITHM
-    add_additional_option("--delta", type=int, default=None,
-                          help="delta for inexact splice junction comparison, chosen automatically based on data type")
-    add_hidden_option("--graph_clustering_distance", type=int, default=None,
-                      help="intron graph clustering distance, "
-                           "splice junctions less that this number of bp apart will not be differentiated")
-    add_additional_option("--matching_strategy", choices=["exact", "precise", "default", "loose"],
-                          help="read-to-isoform matching strategy from the most strict to least", type=str, default=None)
-    add_additional_option("--splice_correction_strategy", choices=["none", "default_pacbio", "default_ont", "conservative_ont", "all", "assembly"],
-                          help="read alignment correction strategy to use",
-                          type=str, default=None)
-    add_additional_option("--model_construction_strategy", choices=["reliable", "default_pacbio",
-                                                                    "sensitive_pacbio", "fl_pacbio", "default_ont",
-                                                                    "sensitive_ont", "all", "assembly"],
-                          help="transcript model construction strategy to use",
-                          type=str, default=None)
-
-    add_additional_option("--transcript_quantification", choices=COUNTING_STRATEGIES,
-                          help="transcript quantification strategy", type=str, default="with_ambiguous")
-    add_additional_option("--gene_quantification", choices=COUNTING_STRATEGIES,
-                          help="gene quantification strategy", type=str, default="with_inconsistent")
-    add_additional_option("--report_novel_unspliced", "-u", type=bool_str,
-                          help="report novel monoexonic transcripts (true/false), "
-                               "default: false for ONT, true for other data types")
-    add_additional_option("--report_canonical",  type=str, choices=[e.name for e in StrandnessReportingLevel],
-                          help="reporting level for novel transcripts based on canonical splice sites;"
-                               " default: " + StrandnessReportingLevel.auto.name,
-                          default=StrandnessReportingLevel.only_stranded.name)
-    add_additional_option("--polya_requirement", type=str, choices=[e.name for e in PolyAUsageStrategies],
-                          help="require polyA tails to be present when reporting transcripts; "
-                               "default: auto (requires polyA only when polyA percentage is >= 70%%)",
-                          default=PolyAUsageStrategies.auto.name)
+    add_additional_option_to_group(algo_args_group, "--report_novel_unspliced", "-u", type=bool_str,
+                                   help="report novel monoexonic transcripts (true/false), "
+                                        "default: false for ONT, true for other data types")
+    add_additional_option_to_group(algo_args_group, "--report_canonical",  type=str,
+                                   choices=[e.name for e in StrandnessReportingLevel],
+                                   help="reporting level for novel transcripts based on canonical splice sites;"
+                                        " default: " + StrandnessReportingLevel.auto.name,
+                                   default=StrandnessReportingLevel.only_stranded.name)
+    add_additional_option_to_group(algo_args_group, "--polya_requirement", type=str,
+                                   choices=[e.name for e in PolyAUsageStrategies],
+                                   help="require polyA tails to be present when reporting transcripts; "
+                                        "default: auto (requires polyA only when polyA percentage is >= 70%%)",
+                                   default=PolyAUsageStrategies.auto.name)
+
+    add_additional_option_to_group(algo_args_group, "--transcript_quantification", choices=COUNTING_STRATEGIES,
+                                   help="transcript quantification strategy", type=str,
+                                   default=CountingStrategy.unique_only.name)
+    add_additional_option_to_group(algo_args_group, "--gene_quantification", choices=COUNTING_STRATEGIES,
+                                   help="gene quantification strategy", type=str,
+                                   default=CountingStrategy.unique_splicing_consistent.name)
+
+    add_additional_option_to_group(algo_args_group, "--matching_strategy",
+                                   choices=["exact", "precise", "default", "loose"],
+                                   help="read-to-isoform matching strategy from the most strict to least",
+                                   type=str, default=None)
+    add_additional_option_to_group(algo_args_group, "--splice_correction_strategy",
+                                   choices=["none", "default_pacbio", "default_ont",
+                                            "conservative_ont", "all", "assembly"],
+                                   help="read alignment correction strategy to use", type=str, default=None)
+    add_additional_option_to_group(algo_args_group, "--model_construction_strategy",
+                                   choices=["reliable", "default_pacbio", "sensitive_pacbio", "fl_pacbio",
+                                            "default_ont", "sensitive_ont", "all", "assembly"],
+                                   help="transcript model construction strategy to use", type=str, default=None)
+
     # OUTPUT PROPERTIES
     pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int,
                                      default="16")
@@ -188,6 +192,11 @@ def add_hidden_option(*args, **kwargs):  # show command only with --full-help
                                    help="align reads to reference without running further analysis")
 
     # ADDITIONAL
+    add_additional_option("--delta", type=int, default=None,
+                          help="delta for inexact splice junction comparison, chosen automatically based on data type")
+    add_hidden_option("--graph_clustering_distance", type=int, default=None,
+                      help="intron graph clustering distance, "
+                           "splice junctions less that this number of bp apart will not be differentiated")
     add_additional_option("--no_gzip", help="do not gzip large output files", dest="gzipped",
                           action='store_false', default=True)
     add_additional_option("--no_gtf_check", help="do not perform GTF checks", dest="gtf_check",

diff --git a/src/long_read_counter.py b/src/long_read_counter.py
@@ -22,28 +22,29 @@
 class CountingStrategy(Enum):
     unique_only = 1
     with_ambiguous = 2
-    with_inconsistent = 10
-    with_inconsistent_minor = 11
+    unique_splicing_consistent = 10
+    unique_inconsistent = 11
+    # all_splicing_consistent = 19
     all = 20
-    all_minor = 21
 
     def no_inconsistent(self):
         return self in [CountingStrategy.unique_only, CountingStrategy.with_ambiguous]
 
     def ambiguous(self):
-        return self in [CountingStrategy.all, CountingStrategy.all_minor, CountingStrategy.with_ambiguous]
+        return self in [CountingStrategy.all, CountingStrategy.with_ambiguous]
 
     def inconsistent_minor(self):
-        return self in [CountingStrategy.with_inconsistent_minor, CountingStrategy.all_minor]
+        return self == CountingStrategy.unique_splicing_consistent
 
     def inconsistent(self):
-        return self in [CountingStrategy.with_inconsistent_minor, CountingStrategy.all_minor,
-                        CountingStrategy.with_inconsistent, CountingStrategy.all]
+        return self in [CountingStrategy.unique_splicing_consistent,
+                        CountingStrategy.unique_inconsistent, CountingStrategy.all]
 
 
 COUNTING_STRATEGIES = [CountingStrategy.unique_only.name,
                        CountingStrategy.with_ambiguous.name,
-                       CountingStrategy.with_inconsistent.name,
+                       CountingStrategy.unique_splicing_consistent.name,
+                       CountingStrategy.unique_inconsistent.name,
                        CountingStrategy.all.name]
 
 
@@ -94,19 +95,17 @@ def confirms_feature(read_assignment):
 
 
 class ReadWeightCounter:
-    def __init__(self, strategy_str, relax_strategy=True):
+    def __init__(self, strategy_str):
         self.strategy = CountingStrategy[strategy_str]
-        if relax_strategy:
-            if self.strategy == CountingStrategy.with_inconsistent:
-                self.strategy = CountingStrategy.with_inconsistent_minor
-            elif self.strategy == CountingStrategy.all:
-                self.strategy = CountingStrategy.all_minor
         self.strategy_flags = CountingStrategyFlags(self.strategy)
 
     def process_inconsistent(self, assignment_type, feature_count):
         # use only for inconsistent assignments
         if assignment_type == ReadAssignmentType.inconsistent_ambiguous or feature_count > 1:
-            return 0.0
+            if self.strategy_flags.use_ambiguous:
+                return 1.0 / feature_count
+            else:
+                return 0.0
         if self.strategy_flags.use_inconsistent:
             return 1.0
         if self.strategy_flags.use_inconsistent_minor and assignment_type == ReadAssignmentType.inconsistent_non_intronic:
@@ -365,15 +364,15 @@ def convert_counts_to_tpm(self):
 
 def create_gene_counter(output_file_name, strategy, complete_feature_list=None,
                         read_groups=None, ignore_read_groups=False, output_zeroes=True):
-    read_weight_counter = ReadWeightCounter(strategy, relax_strategy=False)
+    read_weight_counter = ReadWeightCounter(strategy)
     return AssignedFeatureCounter(output_file_name, GeneAssignmentExtractor,
                                   read_groups, read_weight_counter,
                                   complete_feature_list, ignore_read_groups, output_zeroes)
 
 
 def create_transcript_counter(output_file_name, strategy, complete_feature_list=None,
                               read_groups=None, ignore_read_groups=False, output_zeroes=True):
-    read_weight_counter = ReadWeightCounter(strategy, relax_strategy=True)
+    read_weight_counter = ReadWeightCounter(strategy)
     return AssignedFeatureCounter(output_file_name, TranscriptAssignmentExtractor,
                                   read_groups, read_weight_counter,
                                   complete_feature_list, ignore_read_groups, output_zeroes)