Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collapse2 #171

Merged
merged 3 commits into from
Aug 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 140 additions & 0 deletions src/eventalign.c
Original file line number Diff line number Diff line change
Expand Up @@ -1659,6 +1659,20 @@ void emit_event_alignment_tsv_header(FILE* fp, int8_t print_read_names, int8_t w
fprintf(fp, "\n");
}

// Note: reference_kmer will be same as the model_kmer in this mode
// Also, event_level_mean and event_stdv are always scaled to the model (--scale-events is implicit)
void emit_event_alignment_tsv_m6anet_header(FILE* fp, int8_t print_read_names, int8_t write_signal_index)
{
fprintf(fp, "%s\t%s\t%s\t%s\t", "contig", "position", "reference_kmer",
(print_read_names? "read_name" : "read_index"));
fprintf(fp, "%s\t%s\t%s\t", "event_level_mean", "event_stdv", "event_length");

if(write_signal_index) {
fprintf(fp, "\t%s\t%s", "start_idx", "end_idx");
}

fprintf(fp, "\n");
}

typedef struct {
uint64_t start_raw;
Expand Down Expand Up @@ -2161,6 +2175,132 @@ char *emit_event_alignment_tsv(uint32_t strand_idx,
return sp->s;
}

// algo:
// 1. Remove rows in eventalign.tsv where reference kmer does not match model kmer
// 2. For each row in eventalign.tsv, length of each event = end_idx - start_idx
// 3. Group by each read and each reference position
// - collapsed_event_level_mean = sum(event_level_mean * length) / sum(length)
// - collapsed_event_stdv = sum(event_stdv * length) / sum(length)
// - collapsed_event_duration = sum(event_duration * length) / sum(length)
//cleanup unused function param shit
char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx,
const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings,
const std::vector<event_alignment_t>& alignments,
int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse,
int64_t read_index, char* read_name, char *ref_name,float sample_rate, float *rawptr)
{

kstring_t str;
kstring_t *sp = &str;
str_init(sp, sizeof(char)*alignments.size()*120);

size_t n_collapse = 1;
for(size_t i = 0; i < alignments.size(); i+=n_collapse) {

const event_alignment_t& ea = alignments[i];

// basic information
if (!print_read_names)
{
sprintf_append(sp, "%s\t%d\t%s\t%ld\t",
ref_name, //ea.ref_name.c_str(),
ea.ref_position,
ea.ref_kmer,
(long)read_index);
}
else
{
sprintf_append(sp, "%s\t%d\t%s\t%s\t",
ref_name, //ea.ref_name.c_str(),
ea.ref_position,
ea.ref_kmer,
read_name //sr.read_name.c_str(),
);
}

// event information
uint64_t length = 0;
float event_mean = 0;
float event_stdv = 0;
float event_duration = 0;

// if(strcmp(ea.ref_kmer,ea.model_kmer)==0){ //the ref and model kmers should match
// uint64_t len_curr = (uint64_t)((et->event)[ea.event_idx].length);
// length += len_curr;
// event_mean += get_fully_scaled_level((et->event)[ea.event_idx].mean,scalings) * len_curr;
// event_stdv += (et->event)[ea.event_idx].stdv * len_curr;
// event_duration += get_duration_seconds(et, ea.event_idx, sample_rate) * len_curr;
// }

// uint32_t rank = get_kmer_rank(ea.model_kmer, kmer_size);
// float model_mean = 0.0;
// float model_stdv = 0.0;
// // unscaled model parameters
// if(ea.hmm_state != 'B') {
// model_t model1 = model[rank];
// model_mean = model1.level_mean;
// model_stdv = model1.level_stdv;
// }

uint64_t start_idx = (et->event)[ea.event_idx].start; //inclusive
uint64_t end_idx = (et->event)[ea.event_idx].start + (uint64_t)((et->event)[ea.event_idx].length); //non-inclusive


n_collapse = 0;
while (i + n_collapse < alignments.size() && ea.ref_position == alignments[i+n_collapse].ref_position){
assert(strcmp(ea.ref_kmer,alignments[i+n_collapse].ref_kmer)==0);
// if(strcmp(ea.model_kmer,alignments[i+n_collapse].model_kmer)!=0){ //TODO: NNNN kmers must be handled
// fprintf(stderr, "model kmer does not match! %s vs %s\n",ea.model_kmer,alignments[i+n_collapse].model_kmer);
// }
const event_alignment_t& ea_curr = alignments[i+n_collapse];

if(strcmp(ea_curr.ref_kmer,ea_curr.model_kmer)==0){ //the ref and model kmers should match
uint64_t len_curr = (uint64_t)((et->event)[ea_curr.event_idx].length);
length += len_curr;
event_mean += get_fully_scaled_level((et->event)[ea_curr.event_idx].mean,scalings) * len_curr;
event_stdv += (et->event)[ea_curr.event_idx].stdv * len_curr;
event_duration += get_duration_seconds(et, ea_curr.event_idx, sample_rate) * len_curr;
}

n_collapse++;
}
event_mean /= length;
event_stdv /= length;
event_duration /= length;


// float standard_level = (event_mean - model_mean) / (sqrt(scalings.var) * model_stdv);
sprintf_append(sp, "%.2f\t%.3f\t%.5f\t", event_mean, event_stdv, event_duration);
// sprintf_append(sp, "%s\t%.2f\t%.2f\t%.2f", ea.model_kmer,
// model_mean,
// model_stdv,
// standard_level);

if(write_signal_index) {
if(n_collapse > 1){
uint64_t start_idx1 = start_idx;
uint64_t end_idx1 = end_idx;

const event_alignment_t& ea2 = alignments[i+n_collapse-1];
uint64_t start_idx2 = (et->event)[ea2.event_idx].start;
uint64_t end_idx2 = (et->event)[ea2.event_idx].start + (uint64_t)((et->event)[ea2.event_idx].length);

//min
start_idx = start_idx1 < start_idx2 ? start_idx1 : start_idx2;
//max
end_idx = end_idx1 > end_idx2 ? end_idx1 : end_idx2;
}
sprintf_append(sp, "\t%lu\t%lu", start_idx, end_idx);
}

sprintf_append(sp, "\n");
}


//str_free(sp); //freeing is later done in free_db_tmp()
return sp->s;
}


char *emit_event_alignment_paf(const event_table* et, int64_t len_raw_signal, int64_t ref_len, uint32_t kmer_size, scalings_t scalings,
const std::vector<event_alignment_t>& alignments, bam1_t* bam_record, char* read_name, char *ref_name, int8_t rna)
Expand Down
4 changes: 4 additions & 0 deletions src/f5c.c
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,7 @@ void eventalign_single(core_t* core, db_t* db, int32_t i){
int8_t write_signal_index = (core->opt.flag & F5C_PRINT_SIGNAL_INDEX) ? 1 : 0;
int8_t sam_output = (core->opt.flag & F5C_SAM) ? 1 : 0;
int8_t paf_output = (core->opt.flag & F5C_PAF) ? 1 : 0;
int8_t m6anet_output = (core->opt.flag & F5C_M6ANET) ? 1 : 0;
int8_t rna = (core->opt.flag & F5C_RNA) ? 1 : 0;

if(paf_output){
Expand All @@ -872,6 +873,9 @@ void eventalign_single(core_t* core, db_t* db, int32_t i){
int8_t sam_out_version = core->opt.sam_out_version;
int64_t ref_len = core->m_hdr->target_len[db->bam_rec[i]->core.tid];
db->event_alignment_result_str[i] = emit_event_alignment_sam(qname, core->m_hdr, db->bam_rec[i], *event_alignment_result, sam_out_version, &(db->et[i]), db->sig[i]->nsample, ref_len, rna, db->scalings[i]);
} else if (m6anet_output){
db->event_alignment_result_str[i] = emit_event_alignment_tsv_m6anet(0,&(db->et[i]),core->model,core->kmer_size, db->scalings[i],*event_alignment_result, print_read_names, scale_events, write_samples, write_signal_index, collapse_events,
db->read_idx[i], qname, contig, db->sig[i]->sample_rate, db->sig[i]->rawptr);
} else {
db->event_alignment_result_str[i] = emit_event_alignment_tsv(0,&(db->et[i]),core->model,core->kmer_size, db->scalings[i],*event_alignment_result, print_read_names, scale_events, write_samples, write_signal_index, collapse_events,
db->read_idx[i], qname, contig, db->sig[i]->sample_rate, db->sig[i]->rawptr);
Expand Down
1 change: 1 addition & 0 deletions src/f5c.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
#define F5C_COLLAPSE_EVENTS 0x20000 //collapse events
#define F5C_R10 0x40000 //r10
#define F5C_PAF 0x80000 //paf (eventalign only)
#define F5C_M6ANET 0x100000 //m6anet (eventalign only)

/*************************************************************
* flags for a read status (related to db_t->read_stat_flag) *
Expand Down
6 changes: 6 additions & 0 deletions src/f5cmisc.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,13 @@ char *emit_event_alignment_tsv(uint32_t strand_idx,
const std::vector<event_alignment_t>& alignments,
int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse,
int64_t read_index, char* read_name, char *ref_name, float sample_rate, float *samples);
char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx,
const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings,
const std::vector<event_alignment_t>& alignments,
int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse,
int64_t read_index, char* read_name, char *ref_name, float sample_rate, float *samples);
void emit_event_alignment_tsv_header(FILE* fp, int8_t print_read_names, int8_t write_samples, int8_t write_signal_index);
void emit_event_alignment_tsv_m6anet_header(FILE* fp, int8_t print_read_names, int8_t write_signal_index);
void emit_sam_header(samFile* fp, const bam_hdr_t* hdr);
char *emit_event_alignment_sam(char* read_name, bam_hdr_t* base_hdr, bam1_t* base_record,
const std::vector<event_alignment_t>& alignments, int8_t sam_out_version,
Expand Down
19 changes: 19 additions & 0 deletions src/meth_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ static struct option long_options[] = {
{"pore",required_argument,0,0}, //46 pore
{"paf",no_argument,0,'c'}, //47 if print in paf format (only for eventalign)
{"sam-out-version",required_argument,0,0}, //48 specify the version of the sam output for eventalign (eventalign only)
{"m6anet",no_argument,0,0}, //49 m6anet output (eventalign only)
{0, 0, 0, 0}};


Expand Down Expand Up @@ -447,6 +448,12 @@ int meth_main(int argc, char* argv[], int8_t mode) {
exit(EXIT_FAILURE);
}
is_sam_out_version_set = 1;
} else if (c == 0 && longindex == 49){ //m6anet output
if(mode!=1){
ERROR("%s","Option --m6anet is available only in eventalign");
exit(EXIT_FAILURE);
}
yes_or_no(&opt, F5C_M6ANET, longindex, "yes", 1);
}
}

Expand Down Expand Up @@ -513,6 +520,7 @@ int meth_main(int argc, char* argv[], int8_t mode) {
fprintf(fp_help," --paf write output in PAF format\n");
fprintf(fp_help," --sam write output in SAM format\n");
fprintf(fp_help," --sam-out-version INT sam output version (set 1 to revert to old nanopolish style format) [%d]\n",opt.meth_out_version);
fprintf(fp_help," --m6anet write output in m6anet format\n");

fprintf(fp_help," --print-read-names print read names instead of indexes\n");
fprintf(fp_help," --scale-events scale events to the model, rather than vice-versa\n");
Expand Down Expand Up @@ -573,16 +581,27 @@ int meth_main(int argc, char* argv[], int8_t mode) {
int8_t write_signal_index = (core->opt.flag & F5C_PRINT_SIGNAL_INDEX) ? 1 : 0;
int8_t sam_output = (core->opt.flag & F5C_SAM) ? 1 : 0 ;
int8_t paf_output = (core->opt.flag & F5C_PAF) ? 1 : 0 ;
int8_t m6anet_output = (core->opt.flag & F5C_M6ANET) ? 1 : 0 ;

if(sam_output && paf_output){
ERROR("%s","-c and --sam cannot be used together");
exit(EXIT_FAILURE);
}
if(sam_output && m6anet_output){
ERROR("%s","-c and --m6anet cannot be used together");
exit(EXIT_FAILURE);
}
if(paf_output && m6anet_output){
ERROR("%s","--paf and --m6anet cannot be used together");
exit(EXIT_FAILURE);
}

if(sam_output){
emit_sam_header(core->sam_output, core->m_hdr);
} else if (paf_output){
//none
} else if (m6anet_output){
emit_event_alignment_tsv_m6anet_header(stdout, print_read_names, write_signal_index);
} else{
emit_event_alignment_tsv_header(stdout, print_read_names, write_samples, write_signal_index);
}
Expand Down
Loading