Merge pull request #125 from ncsa/develop

small refactoring for Markov model in development
ncsa · Sep 21, 2024 · 5f9f85a · 5f9f85a
2 parents bf36496 + 2d8c252
commit 5f9f85a
Show file tree

Hide file tree

Showing 15 changed files with 1,062 additions and 917 deletions.
diff --git a/neat/model_sequencing_error/runner.py b/neat/model_sequencing_error/runner.py
@@ -14,7 +14,7 @@
 
 from .utils import parse_file
 from ..common import validate_output_path, validate_input_path
-from ..models import SequencingErrorModel
+from ..models import SequencingErrorModel, TraditionalQualityModel
 from ..variants import Insertion, Deletion, SingleNucleotideVariant
 
 __all__ = [
@@ -89,17 +89,7 @@ def model_seq_err_runner(
     validate_output_path(output_prefix, is_file=False)
     output_prefix = Path(output_prefix)
 
-    """
-    Previously, this was
-    
-    output_file = Path(output_prefix).with_suffix('.p.gz')
-    
-    But this tended to drop parts of the name, if they included dots: 
-    e.g., output_prefix = "/path/to/samplename.testrun" outputted to /path/to/samplename.p.gz
-    
-    This way avoids that issue and outputs to /path/to/samplename.testrun.p.gz, 
-    even though it is a little more cumbersome
-    """
+    # used string logic instead of pathlib here bc pathlib was cutting off extensions.
     output_file = output_prefix.parent / f'{output_prefix.name}.p.gz'
     validate_output_path(output_file, overwrite=overwrite)
     _LOG.info(f'Writing output to: {output_file}')
@@ -116,8 +106,7 @@ def model_seq_err_runner(
             final_quality_scores,
             num_records_to_process,
             offset,
-            read_length,
-            binned_scores
+            read_length
         )
 
         read_parameters.append(parameters_by_position)
@@ -157,23 +146,42 @@ def model_seq_err_runner(
     seq_err_model = SequencingErrorModel(
         avg_seq_error=average_error,
         read_length=read_length,
+    )
+
+    # Just the default model
+    qual_score_model = TraditionalQualityModel(
+        average_error=average_error,
         quality_scores=np.array(final_quality_scores),
         qual_score_probs=read_parameters[0],
     )
 
     if len(files) == 1:
         with gzip.open(output_file, 'w') as out_model:
-            pickle.dump([seq_err_model, None], out_model)
+            pickle.dump({
+                "error_model1": seq_err_model,
+                "error_model2": None,
+                "qual_score_model1": qual_score_model,
+                "qual_score_model2": None
+            }, out_model)
 
     else:
         # Second model if a second input was given
         seq_err_model_r2 = SequencingErrorModel(
             avg_seq_error=average_error,
             read_length=read_length,
+        )
+
+        qual_score_model_r2 = TraditionalQualityModel(
+            average_error=average_error,
             quality_scores=np.array(final_quality_scores),
             qual_score_probs=read_parameters[1]
         )
         with gzip.open(output_file, 'w') as out_model:
-            pickle.dump([seq_err_model, seq_err_model_r2], out_model)
+            pickle.dump({
+                "error_model1": seq_err_model,
+                "error_model2": seq_err_model_r2,
+                "qual_score_model1": qual_score_model,
+                "qual_score_model2": qual_score_model_r2
+            }, out_model)
 
     _LOG.info("Modeling sequencing errors is complete, have a nice day.")
diff --git a/neat/model_sequencing_error/utils.py b/neat/model_sequencing_error/utils.py
@@ -189,12 +189,12 @@ def plot_stuff(init_q, real_q, q_range, prob_q, actual_readlen, plot_path):
     plt.rcParams.update({'font.size': 14, 'font.weight': 'bold', 'lines.linewidth': 3})
 
     plt.figure(1)
-    Z = np.array(init_q).T
-    X, Y = np.meshgrid(range(0, len(Z[0]) + 1), range(0, len(Z) + 1))
-    plt.pcolormesh(X, Y, Z, vmin=0., vmax=0.25)
-    plt.axis([0, len(Z[0]), 0, len(Z)])
-    plt.yticks(range(0, len(Z), 10), range(0, len(Z), 10))
-    plt.xticks(range(0, len(Z[0]), 10), range(0, len(Z[0]), 10))
+    z = np.array(init_q).T
+    x, y = np.meshgrid(range(0, len(z[0]) + 1), range(0, len(z) + 1))
+    plt.pcolormesh(x, Y, z, vmin=0., vmax=0.25)
+    plt.axis([0, len(z[0]), 0, len(z)])
+    plt.yticks(range(0, len(z), 10), range(0, len(z), 10))
+    plt.xticks(range(0, len(z[0]), 10), range(0, len(z[0]), 10))
     plt.xlabel('Read Position')
     plt.ylabel('Quality Score')
     plt.title('Q-Score Prior Probabilities')
@@ -233,7 +233,7 @@ def plot_stuff(init_q, real_q, q_range, prob_q, actual_readlen, plot_path):
 
         plt.figure(p + 1)
         z = np.log10(current_data)
-        x, y = np.meshgrid(range(0, len(Z[0]) + 1), range(0, len(Z) + 1))
+        x, y = np.meshgrid(range(0, len(z[0]) + 1), range(0, len(z) + 1))
         plt.pcolormesh(x, y, z[::-1, :], vmin=v_min_log[0], vmax=v_min_log[1], cmap='jet')
         plt.xlim([q_range[0], q_range[1] + 1])
         plt.ylim([real_q - q_range[1] - 1, real_q - q_range[0]])

diff --git a/neat/models/__init__.py b/neat/models/__init__.py
@@ -1 +1,3 @@
-from .models import *
+from .error_models import *
+from .mutation_model import *
+from .fragment_model import *
diff --git a/neat/models/default_quality_score_model.py b/neat/models/default_quality_score_model.py
@@ -0,0 +1,170 @@
+import numpy as np
+
+# The following default model parameters are here, for tweaking and revision.
+
+# This is based on the input model NEAT was using previously
+default_error_transition_matrix = np.array(
+    [[0.0, 0.4918, 0.3377, 0.1705],
+     [0.5238, 0.0, 0.2661, 0.2101],
+     [0.3755, 0.2355, 0.0, 0.389],
+     [0.2505, 0.2552, 0.4943, 0.0]]
+)
+
+# All numbers from 1-42
+default_quality_scores = np.arange(1, 43)
+
+
+default_qual_score_probs = np.array([
+       [35.20551064,  5.39726466],
+       [35.05310236,  5.63622973],
+       [35.03595806,  5.66646784],
+       [34.98022112,  5.74627755],
+       [34.97572106,  5.75463045],
+       [34.4884581,  6.41083843],
+       [35.01230914,  5.69464369],
+       [35.05956044,  5.61320501],
+       [35.06796147,  5.57262371],
+       [34.96093227,  5.72592418],
+       [34.94813878,  5.75079686],
+       [34.96537761,  5.74288137],
+       [35.04818471,  5.64898391],
+       [35.05231225,  5.64270542],
+       [35.01482501,  5.69340856],
+       [34.9825929,  5.73638614],
+       [34.98442388,  5.7366054],
+       [35.03409773,  5.66869673],
+       [35.0844361,  5.54805317],
+       [35.08961879,  5.53743692],
+       [35.0466029,  5.60379104],
+       [34.99117371,  5.69895578],
+       [34.9815548,  5.71482919],
+       [34.98911734,  5.7011922],
+       [34.93751495,  5.78304943],
+       [34.93036155,  5.79838573],
+       [34.92922972,  5.8309752],
+       [34.96684586,  5.76060028],
+       [34.9708005,  5.74962913],
+       [34.95329795,  5.7652556],
+       [34.92443461,  5.83257523],
+       [34.91492225,  5.84652014],
+       [34.8549109,  5.91585614],
+       [34.94560666,  5.76479635],
+       [34.96711896,  5.72745932],
+       [34.88901206,  5.85709602],
+       [34.93905948,  5.76436017],
+       [34.94363806,  5.7506911],
+       [34.89343347,  5.80958086],
+       [34.95898364,  5.77013784],
+       [34.9618132,  5.77057502],
+       [34.9055352,  5.82500424],
+       [34.95546906,  5.78985858],
+       [34.96132197,  5.77845029],
+       [34.87531478,  5.88705352],
+       [34.84007611,  5.98736108],
+       [34.84061082,  5.98726021],
+       [34.89358007,  5.83662558],
+       [34.88615263,  5.87565349],
+       [34.88569617,  5.87220656],
+       [34.80492984,  6.03141243],
+       [34.88045333,  5.87773523],
+       [34.86089018,  5.92644241],
+       [34.85899099,  5.93253766],
+       [34.85281461,  5.95985895],
+       [34.8233026,  5.92281158],
+       [34.81999927,  5.93027502],
+       [34.86737867,  5.89225314],
+       [34.84964333,  5.94130026],
+       [34.8416016,  5.95394121],
+       [34.83298563,  5.93068537],
+       [34.85175139,  5.91356723],
+       [34.85312089,  5.91332882],
+       [34.81363703,  5.99487116],
+       [34.84683162,  5.91835211],
+       [34.8503877,  5.90848694],
+       [34.75132727,  6.05903999],
+       [34.69121836,  6.10534185],
+       [34.69027134,  6.10258631],
+       [34.55838196,  6.33738628],
+       [34.66787243,  6.16328276],
+       [34.67600762,  6.14921298],
+       [34.69141004,  6.12850504],
+       [34.69170575,  6.11885946],
+       [34.69627971,  6.11449364],
+       [34.61730669,  6.25960228],
+       [34.65572245,  6.15978888],
+       [34.66272688,  6.15123117],
+       [34.61673377,  6.26945453],
+       [34.50737271,  6.36690753],
+       [34.49273489,  6.38753577],
+       [34.53277647,  6.35271811],
+       [34.55186016,  6.29667187],
+       [34.5545907,  6.29615587],
+       [34.42162184,  6.48769325],
+       [34.42162184,  6.48769325],
+       [34.41120476,  6.51973428],
+       [34.41001927,  6.52218756],
+       [34.50872609,  6.36667517],
+       [34.34054261,  6.55724021],
+       [34.31472999,  6.59706435],
+       [34.44800289,  6.370829],
+       [34.38082743,  6.48116784],
+       [34.36736783,  6.50225428],
+       [34.39091046,  6.49333254],
+       [34.14685981,  6.81443525],
+       [34.11286757,  6.86040777],
+       [34.03892364,  6.9167951],
+       [34.09461999,  6.84357267],
+       [34.11624969,  6.81739307],
+       [34.03311938,  6.89976027],
+       [34.19285701,  6.73526643],
+       [34.0871172,  6.85069533],
+       [34.06668596,  6.8773227],
+       [33.96060364,  6.9642502],
+       [33.96399568,  6.94576564],
+       [33.96784324,  6.94562026],
+       [33.90260425,  7.05523946],
+       [33.84443023,  7.10595304],
+       [33.82584028,  7.12368594],
+       [33.83180939,  7.14712464],
+       [33.80809054,  7.14842046],
+       [33.80740632,  7.14768389],
+       [33.77340509,  7.16367224],
+       [33.68690403,  7.30024217],
+       [33.67144094,  7.32120751],
+       [33.71053312,  7.25308594],
+       [33.67084687,  7.25185612],
+       [33.67146605,  7.24695417],
+       [33.50354352,  7.43740665],
+       [33.43878783,  7.43451809],
+       [33.41994711,  7.45122696],
+       [33.41239779,  7.46949964],
+       [33.2412823,  7.64587631],
+       [33.21334153,  7.67937435],
+       [33.26008323,  7.62315701],
+       [33.07948592,  7.80559543],
+       [33.04336633,  7.83646848],
+       [33.06584588,  7.77135486],
+       [32.75754503,  8.02442237],
+       [32.69941899,  8.08132663],
+       [32.32145073,  8.41139017],
+       [32.29822443,  8.37790239],
+       [32.27939786,  8.38937451],
+       [32.12769178,  8.50162153],
+       [31.96975037,  8.62588371],
+       [31.91995927,  8.66055964],
+       [31.90028244,  8.63664978],
+       [31.95031161,  8.63975579],
+       [31.9793546,  8.62708864],
+       [32.01732911,  8.59965107],
+       [31.90794492,  8.63745435],
+       [31.86247331,  8.66495217],
+       [31.8237584,  8.71014014],
+       [31.7715014,  8.75323658],
+       [31.75047688,  8.77579462],
+       [31.50448208,  8.92324659],
+       [31.49927017,  8.91928077],
+       [31.48842743,  8.92882862],
+       [30.88872613,  9.25271538],
+       [28.51071465, 10.2517433]
+])
+