Skip to content

Commit

Permalink
Merge pull request #125 from ncsa/develop
Browse files Browse the repository at this point in the history
small refactoring for Markov model in development
  • Loading branch information
joshfactorial authored Sep 21, 2024
2 parents bf36496 + 2d8c252 commit 5f9f85a
Show file tree
Hide file tree
Showing 15 changed files with 1,062 additions and 917 deletions.
40 changes: 24 additions & 16 deletions neat/model_sequencing_error/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from .utils import parse_file
from ..common import validate_output_path, validate_input_path
from ..models import SequencingErrorModel
from ..models import SequencingErrorModel, TraditionalQualityModel
from ..variants import Insertion, Deletion, SingleNucleotideVariant

__all__ = [
Expand Down Expand Up @@ -89,17 +89,7 @@ def model_seq_err_runner(
validate_output_path(output_prefix, is_file=False)
output_prefix = Path(output_prefix)

"""
Previously, this was
output_file = Path(output_prefix).with_suffix('.p.gz')
But this tended to drop parts of the name, if they included dots:
e.g., output_prefix = "/path/to/samplename.testrun" outputted to /path/to/samplename.p.gz
This way avoids that issue and outputs to /path/to/samplename.testrun.p.gz,
even though it is a little more cumbersome
"""
# used string logic instead of pathlib here bc pathlib was cutting off extensions.
output_file = output_prefix.parent / f'{output_prefix.name}.p.gz'
validate_output_path(output_file, overwrite=overwrite)
_LOG.info(f'Writing output to: {output_file}')
Expand All @@ -116,8 +106,7 @@ def model_seq_err_runner(
final_quality_scores,
num_records_to_process,
offset,
read_length,
binned_scores
read_length
)

read_parameters.append(parameters_by_position)
Expand Down Expand Up @@ -157,23 +146,42 @@ def model_seq_err_runner(
seq_err_model = SequencingErrorModel(
avg_seq_error=average_error,
read_length=read_length,
)

# Just the default model
qual_score_model = TraditionalQualityModel(
average_error=average_error,
quality_scores=np.array(final_quality_scores),
qual_score_probs=read_parameters[0],
)

if len(files) == 1:
with gzip.open(output_file, 'w') as out_model:
pickle.dump([seq_err_model, None], out_model)
pickle.dump({
"error_model1": seq_err_model,
"error_model2": None,
"qual_score_model1": qual_score_model,
"qual_score_model2": None
}, out_model)

else:
# Second model if a second input was given
seq_err_model_r2 = SequencingErrorModel(
avg_seq_error=average_error,
read_length=read_length,
)

qual_score_model_r2 = TraditionalQualityModel(
average_error=average_error,
quality_scores=np.array(final_quality_scores),
qual_score_probs=read_parameters[1]
)
with gzip.open(output_file, 'w') as out_model:
pickle.dump([seq_err_model, seq_err_model_r2], out_model)
pickle.dump({
"error_model1": seq_err_model,
"error_model2": seq_err_model_r2,
"qual_score_model1": qual_score_model,
"qual_score_model2": qual_score_model_r2
}, out_model)

_LOG.info("Modeling sequencing errors is complete, have a nice day.")
14 changes: 7 additions & 7 deletions neat/model_sequencing_error/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,12 +189,12 @@ def plot_stuff(init_q, real_q, q_range, prob_q, actual_readlen, plot_path):
plt.rcParams.update({'font.size': 14, 'font.weight': 'bold', 'lines.linewidth': 3})

plt.figure(1)
Z = np.array(init_q).T
X, Y = np.meshgrid(range(0, len(Z[0]) + 1), range(0, len(Z) + 1))
plt.pcolormesh(X, Y, Z, vmin=0., vmax=0.25)
plt.axis([0, len(Z[0]), 0, len(Z)])
plt.yticks(range(0, len(Z), 10), range(0, len(Z), 10))
plt.xticks(range(0, len(Z[0]), 10), range(0, len(Z[0]), 10))
z = np.array(init_q).T
x, y = np.meshgrid(range(0, len(z[0]) + 1), range(0, len(z) + 1))
plt.pcolormesh(x, Y, z, vmin=0., vmax=0.25)
plt.axis([0, len(z[0]), 0, len(z)])
plt.yticks(range(0, len(z), 10), range(0, len(z), 10))
plt.xticks(range(0, len(z[0]), 10), range(0, len(z[0]), 10))
plt.xlabel('Read Position')
plt.ylabel('Quality Score')
plt.title('Q-Score Prior Probabilities')
Expand Down Expand Up @@ -233,7 +233,7 @@ def plot_stuff(init_q, real_q, q_range, prob_q, actual_readlen, plot_path):

plt.figure(p + 1)
z = np.log10(current_data)
x, y = np.meshgrid(range(0, len(Z[0]) + 1), range(0, len(Z) + 1))
x, y = np.meshgrid(range(0, len(z[0]) + 1), range(0, len(z) + 1))
plt.pcolormesh(x, y, z[::-1, :], vmin=v_min_log[0], vmax=v_min_log[1], cmap='jet')
plt.xlim([q_range[0], q_range[1] + 1])
plt.ylim([real_q - q_range[1] - 1, real_q - q_range[0]])
Expand Down
4 changes: 3 additions & 1 deletion neat/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
from .models import *
from .error_models import *
from .mutation_model import *
from .fragment_model import *
170 changes: 170 additions & 0 deletions neat/models/default_quality_score_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import numpy as np

# The following default model parameters are here, for tweaking and revision.

# This is based on the input model NEAT was using previously
default_error_transition_matrix = np.array(
[[0.0, 0.4918, 0.3377, 0.1705],
[0.5238, 0.0, 0.2661, 0.2101],
[0.3755, 0.2355, 0.0, 0.389],
[0.2505, 0.2552, 0.4943, 0.0]]
)

# All numbers from 1-42
default_quality_scores = np.arange(1, 43)


default_qual_score_probs = np.array([
[35.20551064, 5.39726466],
[35.05310236, 5.63622973],
[35.03595806, 5.66646784],
[34.98022112, 5.74627755],
[34.97572106, 5.75463045],
[34.4884581, 6.41083843],
[35.01230914, 5.69464369],
[35.05956044, 5.61320501],
[35.06796147, 5.57262371],
[34.96093227, 5.72592418],
[34.94813878, 5.75079686],
[34.96537761, 5.74288137],
[35.04818471, 5.64898391],
[35.05231225, 5.64270542],
[35.01482501, 5.69340856],
[34.9825929, 5.73638614],
[34.98442388, 5.7366054],
[35.03409773, 5.66869673],
[35.0844361, 5.54805317],
[35.08961879, 5.53743692],
[35.0466029, 5.60379104],
[34.99117371, 5.69895578],
[34.9815548, 5.71482919],
[34.98911734, 5.7011922],
[34.93751495, 5.78304943],
[34.93036155, 5.79838573],
[34.92922972, 5.8309752],
[34.96684586, 5.76060028],
[34.9708005, 5.74962913],
[34.95329795, 5.7652556],
[34.92443461, 5.83257523],
[34.91492225, 5.84652014],
[34.8549109, 5.91585614],
[34.94560666, 5.76479635],
[34.96711896, 5.72745932],
[34.88901206, 5.85709602],
[34.93905948, 5.76436017],
[34.94363806, 5.7506911],
[34.89343347, 5.80958086],
[34.95898364, 5.77013784],
[34.9618132, 5.77057502],
[34.9055352, 5.82500424],
[34.95546906, 5.78985858],
[34.96132197, 5.77845029],
[34.87531478, 5.88705352],
[34.84007611, 5.98736108],
[34.84061082, 5.98726021],
[34.89358007, 5.83662558],
[34.88615263, 5.87565349],
[34.88569617, 5.87220656],
[34.80492984, 6.03141243],
[34.88045333, 5.87773523],
[34.86089018, 5.92644241],
[34.85899099, 5.93253766],
[34.85281461, 5.95985895],
[34.8233026, 5.92281158],
[34.81999927, 5.93027502],
[34.86737867, 5.89225314],
[34.84964333, 5.94130026],
[34.8416016, 5.95394121],
[34.83298563, 5.93068537],
[34.85175139, 5.91356723],
[34.85312089, 5.91332882],
[34.81363703, 5.99487116],
[34.84683162, 5.91835211],
[34.8503877, 5.90848694],
[34.75132727, 6.05903999],
[34.69121836, 6.10534185],
[34.69027134, 6.10258631],
[34.55838196, 6.33738628],
[34.66787243, 6.16328276],
[34.67600762, 6.14921298],
[34.69141004, 6.12850504],
[34.69170575, 6.11885946],
[34.69627971, 6.11449364],
[34.61730669, 6.25960228],
[34.65572245, 6.15978888],
[34.66272688, 6.15123117],
[34.61673377, 6.26945453],
[34.50737271, 6.36690753],
[34.49273489, 6.38753577],
[34.53277647, 6.35271811],
[34.55186016, 6.29667187],
[34.5545907, 6.29615587],
[34.42162184, 6.48769325],
[34.42162184, 6.48769325],
[34.41120476, 6.51973428],
[34.41001927, 6.52218756],
[34.50872609, 6.36667517],
[34.34054261, 6.55724021],
[34.31472999, 6.59706435],
[34.44800289, 6.370829],
[34.38082743, 6.48116784],
[34.36736783, 6.50225428],
[34.39091046, 6.49333254],
[34.14685981, 6.81443525],
[34.11286757, 6.86040777],
[34.03892364, 6.9167951],
[34.09461999, 6.84357267],
[34.11624969, 6.81739307],
[34.03311938, 6.89976027],
[34.19285701, 6.73526643],
[34.0871172, 6.85069533],
[34.06668596, 6.8773227],
[33.96060364, 6.9642502],
[33.96399568, 6.94576564],
[33.96784324, 6.94562026],
[33.90260425, 7.05523946],
[33.84443023, 7.10595304],
[33.82584028, 7.12368594],
[33.83180939, 7.14712464],
[33.80809054, 7.14842046],
[33.80740632, 7.14768389],
[33.77340509, 7.16367224],
[33.68690403, 7.30024217],
[33.67144094, 7.32120751],
[33.71053312, 7.25308594],
[33.67084687, 7.25185612],
[33.67146605, 7.24695417],
[33.50354352, 7.43740665],
[33.43878783, 7.43451809],
[33.41994711, 7.45122696],
[33.41239779, 7.46949964],
[33.2412823, 7.64587631],
[33.21334153, 7.67937435],
[33.26008323, 7.62315701],
[33.07948592, 7.80559543],
[33.04336633, 7.83646848],
[33.06584588, 7.77135486],
[32.75754503, 8.02442237],
[32.69941899, 8.08132663],
[32.32145073, 8.41139017],
[32.29822443, 8.37790239],
[32.27939786, 8.38937451],
[32.12769178, 8.50162153],
[31.96975037, 8.62588371],
[31.91995927, 8.66055964],
[31.90028244, 8.63664978],
[31.95031161, 8.63975579],
[31.9793546, 8.62708864],
[32.01732911, 8.59965107],
[31.90794492, 8.63745435],
[31.86247331, 8.66495217],
[31.8237584, 8.71014014],
[31.7715014, 8.75323658],
[31.75047688, 8.77579462],
[31.50448208, 8.92324659],
[31.49927017, 8.91928077],
[31.48842743, 8.92882862],
[30.88872613, 9.25271538],
[28.51071465, 10.2517433]
])

Loading

0 comments on commit 5f9f85a

Please sign in to comment.