From 3bda739c2d0a5ec83fbdda1369dd19d16c372e64 Mon Sep 17 00:00:00 2001 From: Nicolas Dierckxsens Date: Tue, 10 Apr 2018 14:53:08 +0200 Subject: [PATCH] Create NOVOPlasty2.6.6.pl --- NOVOPlasty2.6.6.pl | 22344 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 22344 insertions(+) create mode 100644 NOVOPlasty2.6.6.pl diff --git a/NOVOPlasty2.6.6.pl b/NOVOPlasty2.6.6.pl new file mode 100644 index 0000000..0991d1a --- /dev/null +++ b/NOVOPlasty2.6.6.pl @@ -0,0 +1,22344 @@ +#!/usr/bin/env perl +###################################################### +# SOFTWARE COPYRIGHT NOTICE AGREEMENT # +# Copyright (C) {2015-2018} {Nicolas Dierckxsens} # +# All Rights Reserved # +# See file LICENSE for details. # +###################################################### +# NOVOPlasty - The Organelle Assembler +# nicolasdierckxsens@hotmail.com +use Getopt::Long; +use strict; + +my $left = '9'; +my $right = '15'; +my $insert_range = '1.8'; +my $insert_range_back = '1.8'; +my $noseed = ""; +my $iterations = "1000000"; +my $startprint = "1000000"; +my $startprint2 = "1000000"; +my $max_memory; +my $option = '1'; +my $count_coverage = '0'; +my $total_extensions; +my $average_coverage_ext; +my $platform = "illumina"; +my $variance_detection; +my $chromosome; +my $heteroplasmy; +my $hp_seed_assemble; +my %SNPs; +my %accepted_SNPs; +my %accepted_SNPs_pair; +my %accepted_SNPs_back; +my %accepted_SNPs_pair_back; +my $hp_seed_assemble_last_chance; +my $current_pos = '0'; +my $hp_first_SNP; +my %linked_SNPs; +my %not_linked_SNPs; +my %linked_half_SNPs2; +my $linked_half_SNPs2_check; +my %linked_half_SNPs; +my %linked_half_SNPs_exclude; +my $first_linked_half_SNP_pos; +my $first_linked_half_SNP_read; +my $use_quality; +my %A; +my %C; +my %T; +my %G; +my %SNR_regions_hp; +my $hp_back; +my $ref_skip_before; +my $ref_skip_before_back; +my $benchmark_time = "yes"; +my $time_start_seed; +my $time_before_merge; +my $time_before_scan; +my $time_end_hash_scan; +my $time_collect_ext; +my 
$time_back; + +my $insert_size_correct = "yes"; +my $overlap; +my $insert_range_b; +my $insert_range_c; +my $genome_range_low; +my $genome_range_high; +my $genome_range ; +my $seed_input0; +my $seed_input; +my $insert_size; +my $project; +my $read_length; +my $type; +my $encrypt = "no"; +my $paired; +my $print_log; +my $save_reads; +my $bad_read; +my %contigs_id; +my %contigs_end; +my $contig_num = '1'; +my $id = ""; +my $id_pair = ""; +my $id_match = ""; +my $contig_end_check; + +my %seed; +my %seed_old; +my $counttest1 = '0'; +my $indel_split = '0'; +my $indel_split_skip; +my %indel_split_skip; +my %indel_split; +my $indel_split_back = '0'; +my $indel_split_skip_back; +my %indel_split_skip_back; +my %indel_split_back; +my $insert_range2 = $insert_range; +my $SNR_read; +my $SNR_read_back; +my $SNR_read2; +my $SNR_read_back2; +my %best_extension_prev; +my %best_extension_back_prev; +my %contig_gap_min; +my %contig_gap_max; +my %contig_count; +my $contig_count; +my $progress_before; +my $contig_id2; +my $contig_id1; +my $contig_id3; +my $contig_id4; +my %seed_split; +my $lastbit_contig_prev; +my $split_forward; +my %old_id; +my %old_id2; +my %old_rep; +my %old_rep_old; +my %SNR_length; +my @insert_size; +my %regex; +my %regex_back; +my %regex1; +my %regexb; +my %regex2b; +my %noback; +my %noforward; +my %SNR; +my %SNR_back; +my $SNR_nucleo; +my $SNR_nucleo_back; +my %id_bad; +my $y ='1'; +my %position; +my %position_back; +my %bad_read; +my %position_adjust; +my %read_end_tmp; +my %read_end_b_tmp; +my %read_short_end_tmp; +my %read_short_zone_tmp; +my %read_start_tmp; +my %read_start_b_tmp; +my %read_short_start_tmp; +my %read_short_zone_start_tmp; +my %tree; +my %contig_id2; +my %contig_id1; +my %contig_id3; +my %contig_id4; +my %hash; +my %hash2b; +my %hash2c; +my @row; +my %row; +my %match_rep; +my %count_rep; +my %before; +my %before_back; +my %first_before; +my %SNP_active; +my %first_before_back; +my %SNP_active_back; +my %filter_before1; +my %filter_before2; 
+my %filter_before3; +my %filter_before4; +my %filter_before1_pair; +my %filter_before2_pair; +my %filter_before3_pair; +my %filter_before4_pair; +my %nosecond; +my %repetitive_pair; +my %count_reads; +my %count_reads_all; +my $assembly_length = '1'; +my $assembly_success; +my %seeds_check; +my %save_reads; +my %contigs; +my %variance_forward; +my %variance_back; +my %last_ref_seq_forward; +my %last_ref_seq_back; +my %last_pos_seq_forward; +my %last_pos_seq_back; +my %large_variance_forward; +my %large_variance_back; +my %large_variance_length_forward; +my %large_variance_length_back; +my %last_ref_pos_forward; +my %last_ref_pos_back; +my %allele_percentage; +my %allele_percentage_back; +my %allele_total; +my %allele_total_back; +my %variance_all; +my %variance_all_SNP; +my %split_positions; +my %no_next_seed; +my %remove_extension_mismatch; + +my $reads12; +my $reads1; +my $reads2; +my $config; +my $read; +my $deletion; +my $deletion_back; +my $reference_guided; +my $reference_guided_back; +my $contig_read2; +my $contig_read1; +my $contig_read3; +my $contig_read4; +my $first_contig_start; +my $first_contig_start_reverse; +my $finish; +my $repetitive_detect; +my $repetitive_detect_back; +my $repetitive_detect2; +my $repetitive_detect_back2; +my $contig_end; +my $repetitive; +my $before_repetitive; +my $before_repetitive_short; +my $CP_check; +my $before_extension1; +my $before_extension2; +my $before_extension_back1; +my $before_extension_back2; +my $id_split1; +my $id_split2; +my $id_split3; +my $before; +my $before_back; +my $end_repetitive_tmp4; +my $first_before; +my $SNP_active; +my $first_before_back; +my $SNP_active_back; +my $nosecond; +my $first_contig_id; +my $no_contig_id2; +my $no_contig_id1; +my $no_contig_id3; +my $no_contig_id4; +my $rep_detect2; +my $hasL; +my $no_next_seed; +my $count_split; +my @firstSNP_max; +my %yuyu_option; +my %yuyu_option_back; +my $yuyu_option_A; +my $yuyu_option_C; +my $yuyu_option_T; +my $yuyu_option_G; +my 
$yuyu_option_A_back; +my $yuyu_option_C_back; +my $yuyu_option_T_back; +my $yuyu_option_G_back; +my $extensions_before; +my $before_shorter_skip; +my %before_shorter_skip; +my $before_shorter_skip_back; +my %before_shorter_skip_back; +my $SNR_next_seed; +my $jump_rep; +my $jump_rep_because_stuck; +my %jump_rep; +my %jump_rep_because_stuck; +my $count_stuck_in_rep; +my $jump_rep_back; +my %jump_rep_back; +my $AT_rich_before; +my $AT_rich_before_back; +my $insert_range_shorter; +my $cp_input; +my $reference; +my $merge_now; +my $best_extension_old1; +my $best_extension_old2; +my $best_extension_old3; +my $best_extension_old4; +my $count1b_tmp; +my $count2b_tmp; +my $count3b_tmp; +my $count4b_tmp; +my $overhang_check; +my %reference_next_seed; +my $reference_next_seed; +my $next_seed_ref; +my $last_150; +my $first_150; +my $PCR_free = "yes"; +my $best_extension_forward; +my $noforward_HP; +my $noback_HP; + +GetOptions ( + "c=s" => \$config, + ) or die "Incorrect usage!\n"; + +open(CONFIG, $config) or die "Error: $!\nCan't open the configuration file, please check the manual!\n"; + +my $ln ='0'; + +while (my $line = <CONFIG>) +{ + if ($ln eq '2') + { + $project = substr $line, 24; + chomp $project; + } + if ($ln eq '3') + { + $type = substr $line, 24; + chomp $type; + } + if ($ln eq '4') + { + $genome_range = substr $line, 24; + chomp $genome_range; + my @words = split /-/, $genome_range; + $genome_range_low = $words[0]; + $genome_range_high = $words[1]; + } + if ($ln eq '5') + { + $overlap = substr $line, 24; + chomp $overlap; + } + if ($ln eq '6') + { + $max_memory = substr $line, 24; + chomp $max_memory; + } + if ($ln eq '7') + { + $print_log = substr $line, 24; + chomp $print_log; + } + if ($ln eq '8') + { + $save_reads = substr $line, 24; + chomp $save_reads; + } + if ($ln eq '9') + { + $seed_input0 = substr $line, 24; + chomp $seed_input0; + } + if ($ln eq '10') + { + $reference = substr $line, 24; + chomp $reference; + } + if ($ln eq '11') + { + $variance_detection = 
substr $line, 24; + chomp $variance_detection; + } + if ($ln eq '12') + { + $heteroplasmy = substr $line, 24; + chomp $heteroplasmy; + if ($heteroplasmy ne "") + { + $use_quality = "yes"; + } + } + if ($ln eq '13') + { + $cp_input = substr $line, 24; + chomp $cp_input; + } + + if ($ln eq '17') + { + $read_length = substr $line, 24; + chomp $read_length; + } + if ($ln eq '18') + { + $insert_size = substr $line, 24; + chomp $insert_size; + } + if ($ln eq '19') + { + $platform = substr $line, 24; + chomp $platform; + } + if ($ln eq '20') + { + $paired = substr $line, 24; + chomp $paired; + } + if ($ln eq '21') + { + $reads12 = substr $line, 24; + chomp $reads12; + } + if ($ln eq '22') + { + $reads1 = substr $line, 24; + chomp $reads1; + } + if ($ln eq '23') + { + $reads2 = substr $line, 24; + chomp $reads2; + } + + if ($ln eq '27') + { + $insert_size_correct = substr $line, 24; + chomp $insert_size_correct; + } + if ($ln eq '28') + { + $insert_range_b = substr $line, 24; + chomp $insert_range_b; + } + if ($ln eq '29') + { + $insert_range_c = substr $line, 24; + chomp $insert_range_c; + } + $ln++; +} + +close CONFIG; +if ($print_log eq '1' || $print_log eq '2') +{ + $startprint2 = '0'; + $startprint = '10000000'; +} + +if ($platform eq "Ion") +{ + $platform = "ion"; +} +if ($platform eq "Illumina") +{ + $platform = "illumina"; +} +if ($paired eq "SE") +{ + $insert_size = $read_length*2; + $insert_size_correct = "no"; +} +my $output_file4 = "log_".$project.".txt"; +open(OUTPUT4, ">" .$output_file4) or die "\nCan't open file $output_file4, $!\n"; + +print "\n\n-----------------------------------------------"; +print "\nNOVOPlasty: The Organelle Assembler\n"; +print "Version 2.6.6\n"; +print "Author: Nicolas Dierckxsens, (c) 2015-2018\n"; +print "-----------------------------------------------\n\n"; + +print "\nInput parameters from the configuration file: *** Verify if everything is correct ***\n\n"; +print "Project:\n"; +print "-----------------------\n"; +print 
"Project name = ".$project."\n"; +print "Type = ".$type."\n"; +print "Genome range = ".$genome_range."\n"; +print "K-mer = ".$overlap."\n"; +print "Max memory = ".$max_memory."\n"; +print "Extended log = ".$print_log."\n"; +print "Save assembled reads = ".$save_reads."\n"; +print "Seed Input = ".$seed_input0."\n"; +print "Reference sequence = ".$reference."\n"; +print "Variance detection = ".$variance_detection."\n"; +print "Heteroplasmy = ".$heteroplasmy."\n"; +print "Chloroplast sequence = ".$cp_input."\n\n"; + +print "Dataset 1:\n"; +print "-----------------------\n"; +print "Read Length = ".$read_length."\n"; +print "Insert size = ".$insert_size."\n"; +print "Platform = ".$platform."\n"; +print "Single/Paired = ".$paired."\n"; +print "Combined reads = ".$reads12."\n"; +print "Forward reads = ".$reads1."\n"; +print "Reverse reads = ".$reads2."\n\n"; + +print "Optional:\n"; +print "-----------------------\n"; +print "Insert size auto = ".$insert_size_correct."\n"; +print "Insert range = ".$insert_range_b."\n"; +print "Insert range strict = ".$insert_range_c."\n\n"; + +print OUTPUT4 "\n\n-----------------------------------------------"; +print OUTPUT4 "\nNOVOPlasty: The Organelle Assembler\n"; +print OUTPUT4 "Version 2.6.6\n"; +print OUTPUT4 "Author: Nicolas Dierckxsens, (c) 2015-2018\n"; +print OUTPUT4 "-----------------------------------------------\n\n"; + +print OUTPUT4 "\nInput parameters from the configuration file: *** Verify if everything is correct ***\n\n"; +print OUTPUT4 "Project:\n"; +print OUTPUT4 "----------------------\n"; +print OUTPUT4 "Project name = ".$project."\n"; +print OUTPUT4 "Type = ".$type."\n"; +print OUTPUT4 "Genome range = ".$genome_range."\n"; +print OUTPUT4 "K-mer = ".$overlap."\n"; +print OUTPUT4 "Max memory = ".$max_memory."\n"; +print OUTPUT4 "Extended log = ".$print_log."\n"; +print OUTPUT4 "Save assembled reads = ".$save_reads."\n"; +print OUTPUT4 "Seed Input = ".$seed_input0."\n"; +print OUTPUT4 "Reference sequence = 
".$reference."\n"; +print OUTPUT4 "Variance detection = ".$variance_detection."\n"; +print OUTPUT4 "Heteroplasmy = ".$heteroplasmy."\n"; +print OUTPUT4 "Chloroplast sequence = ".$cp_input."\n\n"; + +print OUTPUT4 "Dataset 1:\n"; +print OUTPUT4 "----------------------\n"; +print OUTPUT4 "Read Length = ".$read_length."\n"; +print OUTPUT4 "Insert size = ".$insert_size."\n"; +print OUTPUT4 "Platform = ".$platform."\n"; +print OUTPUT4 "Single/Paired = ".$paired."\n"; +print OUTPUT4 "Combined reads = ".$reads12."\n"; +print OUTPUT4 "Forward reads = ".$reads1."\n"; +print OUTPUT4 "Reverse reads = ".$reads2."\n\n"; + +print OUTPUT4 "Optional:\n"; +print OUTPUT4 "----------------------\n"; +print OUTPUT4 "Insert size auto = ".$insert_size_correct."\n"; +print OUTPUT4 "Insert range = ".$insert_range_b."\n"; +print OUTPUT4 "Insert range strict = ".$insert_range_c."\n\n"; + +if ($platform ne "illumina" && $platform ne "ion") +{ + die "\nPlatform has to be 'illumina' or 'ion', please check the configuration file!\n"; +} +if ($paired ne "PE" && $paired ne "SE") +{ + die "\nPaired has to be 'SE' or 'PE', please check the configuration file!\n"; +} +if ($reads12 ne "" && $reads1 ne "") +{ + die "\nYou can't give a path for a combined dataset and a forward and reverse set!\n If you have both, only use the forward and reverse path in the config file\n"; +} + +if ($variance_detection eq "yes" && $reference eq "") +{ + die "\nWhen variance detection is on, you must give a reference sequence, please check the configuration file!\n"; +} +if ($variance_detection eq "yes" && $heteroplasmy ne "") +{ + die "\nYou can't use variance and heteroplasmy detection at the same time!\n\nVariance detection should be used to compare with a reference other than itself.\n\nFor the heteroplasmy option you first need to assemble the organelle and then give this as a reference.\n"; +} +if ($save_reads ne "yes" && $save_reads ne "1" && $save_reads ne "2" && $save_reads ne "" && $save_reads ne "no") +{ + 
die "\n'Save assembled reads' has to be '1', '2' or empty, please check the configuration file!\n"; +} +my $USAGE = "\nUsage: perl NOVOPlasty.pl -c config_example.txt"; + + + +sub build_partial +{ + my $A = ""; + my $G = ""; + my $T = ""; + my $C = ""; + my ($str) = (@_); + my @re; + undef @re; + my $v = length($str); + my $m = '1'; + my $star = substr $str, 0,1; + + if ($star eq "*") + { + substr $str, 0,1, "."; + } + + while ($m < $v) + { + my $str9 = substr $str, $m+1; + my $str6 = substr $str, 0, $m; + my $x = '0'; + my $y = length($str6); + + while ($x < $y) + { + my $str8 = substr $str6, $x+1; + my $str7 = substr $str6, 0, $x; + $A = $str7.".".$str8.".".$str9; + push @re, $A; + $x++; + } + $m++; + } + my $re = join ('|' , @re); + qr/$re/; +} +sub build_partialb +{ + my $A = ""; + my $G = ""; + my $T = ""; + my $C = ""; + + my ($str) = (@_); + my @re; + undef @re; + my $v = length($str); + my $m = '1'; + + while ($m < $v) + { + my $str9 = substr $str, $m+1; + my $str6 = substr $str, 0, $m; + my $x = '0'; + my $y = length($str6); + + while ($x < $y) + { + my $str8 = substr $str6, $x+1; + my $str7 = substr $str6, 0, $x; + $A = $str7.".".$str8.".".$str9; + push @re, $A; + $x++; + } + $m++; + } + @re; +} +sub build_partialb_4dots +{ + my ($str) = (@_); + my $str_old = $str; + my @re; + undef @re; + my $x = '0'; + my $v = length($str); + while ($x < $v-3) + { + substr $str, $x, 1, "."; + my $x2 = $x+1; + my $str1 = $str; + while ($x2 < $v-2) + { + substr $str, $x2, 1, "."; + my $x3 = $x2+1; + my $str2 = $str; + while ($x3 < $v-1) + { + substr $str, $x3, 1, "."; + my $x4 = $x3+1; + my $str3 = $str; + while ($x4 < $v) + { + substr $str, $x4, 1, "."; + push @re, $str; + $str = $str3; + $x4++; + } + $str = $str2; + $x3++; + } + $str = $str1; + $x2++; + } + $str = $str_old; + $x++; + } + @re; +} +sub build_partial2b +{ + my $A = ""; + my (%str) = (@_); + my %re; + undef %re; + + foreach my $str (keys %str) + { + my $v = length($str)-1; + my $m = '2'; + + while ($m < $v) 
+ { + my $str9 = substr $str, $m+1; + my $str6 = substr $str, 0, $m; + my $x = '0'; + my $y = length($str6); + + while ($x < $y) + { + my $str8 = substr $str6, $x+1; + my $str7 = substr $str6, 0, $x; + $re{$str7."A".$str8."A".$str9} = undef; + $re{$str7."A".$str8."C".$str9} = undef; + $re{$str7."A".$str8."T".$str9} = undef; + $re{$str7."A".$str8."G".$str9} = undef; + $re{$str7."C".$str8."A".$str9} = undef; + $re{$str7."C".$str8."C".$str9} = undef; + $re{$str7."C".$str8."T".$str9} = undef; + $re{$str7."C".$str8."G".$str9} = undef; + $re{$str7."T".$str8."A".$str9} = undef; + $re{$str7."T".$str8."C".$str9} = undef; + $re{$str7."T".$str8."T".$str9} = undef; + $re{$str7."T".$str8."G".$str9} = undef; + $re{$str7."G".$str8."A".$str9} = undef; + $re{$str7."G".$str8."C".$str9} = undef; + $re{$str7."G".$str8."T".$str9} = undef; + $re{$str7."G".$str8."G".$str9} = undef; + $x++; + } + $m++; + } + } + %re; +} +sub build_partial3b { + +my $A = ""; +my @str = @_; +my $str = $str[0]; +my $reverse = $str[1]; +my %re; +my %re2; +my %re3; +undef %re; +undef %re2; +undef %re3; +$str =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; +my $hasdot = $str =~ tr/\.//; +my $hasstar = $str =~ tr/\*//; + + + $re2{$str} = undef; + + if ($hasstar > 0) + { + foreach my $str (keys %re2) + { + if ($reverse eq "reverse" || $reverse eq "reverse_back") + { + my $str2 = $str; + $str2 =~ s/\*.//g; + $re3{$str2} = undef; + } + else + { + my $str2 = $str; + $str2 =~ s/.\*//g; + $re3{$str2} = undef; + } + if ($reverse eq "reverse" || $reverse eq "back") + { + $str =~ tr/\*//d; + my $temp_sre = $str; + $str = substr $temp_sre, 0, -$hasstar; + $re3{$str} = undef; + } + else + { + $str =~ tr/\*//d; + my $temp_sre = $str; + $str = substr $temp_sre, $hasstar; + $re3{$str} = undef; + } + } + } + else + { + %re3 = %re2; + } + +if ($hasdot > 0) +{ + foreach my $str (keys %re3) + { + $hasdot = $str =~ tr/\.//; + if ($hasdot eq '1') + { + if ($str =~ m/^(\w*)\.(\w*)$/) + { + my $str7 = $1; + my $str8 = $2; + + $re{$str7."A".$str8} = 
undef; + $re{$str7."C".$str8} = undef; + $re{$str7."T".$str8} = undef; + $re{$str7."G".$str8} = undef; + } + } + elsif ($hasdot eq '2') + { + if ($str =~ m/^(\w*)\.(\w*)\.(\w*)$/) + { + my $str7 = $1; + my $str8 = $2; + my $str9 = $3; + + $A = $str7."A".$str8."A".$str9; #two substitutions + $re{$A} = undef; + $A = $str7."A".$str8."C".$str9; + $re{$A} = undef; + $A = $str7."A".$str8."T".$str9; + $re{$A} = undef; + $A = $str7."A".$str8."G".$str9; + $re{$A} = undef; + $A = $str7."C".$str8."A".$str9; #two substitutions + $re{$A} = undef; + $A = $str7."C".$str8."C".$str9; + $re{$A} = undef; + $A = $str7."C".$str8."T".$str9; + $re{$A} = undef; + $A = $str7."C".$str8."G".$str9; + $re{$A} = undef; + $A = $str7."T".$str8."A".$str9; #two substitutions + $re{$A} = undef; + $A = $str7."T".$str8."C".$str9; + $re{$A} = undef; + $A = $str7."T".$str8."T".$str9; + $re{$A} = undef; + $A = $str7."T".$str8."G".$str9; + $re{$A} = undef; + $A = $str7."G".$str8."A".$str9; #two substitutions + $re{$A} = undef; + $A = $str7."G".$str8."C".$str9; + $re{$A} = undef; + $A = $str7."G".$str8."T".$str9; + $re{$A} = undef; + $A = $str7."G".$str8."G".$str9; + $re{$A} = undef; + } + } + elsif ($hasdot eq '3') + { + my $str1 = $str; + my $x = '0'; + + my @combi = ('A','C','T','G'); + while ($x < 4) + { + $str1 = $str; + $str1 =~ s/\./$combi[$x]/; + my $x2 = '0'; + while ($x2 < 4) + { + my $str2 = $str1; + $str2 =~ s/\./$combi[$x2]/; + my $x3 = '0'; + while ($x3 < 4) + { + my $str3 = $str2; + $str3 =~ s/\./$combi[$x3]/; + $re{$str3} = undef; + $x3++; + } + $x2++; + } + $x++; + } + } + elsif ($hasdot eq '4') + { + my $str1 = $str; + my $x = '0'; + my @combi = ('A','C','T','G'); + while ($x < 4) + { + $str1 = $str; + $str1 =~ s/\./$combi[$x]/; + my $x2 = '0'; + while ($x2 < 4) + { + my $str2 = $str1; + $str2 =~ s/\./$combi[$x2]/; + my $x3 = '0'; + while ($x3 < 4) + { + my $str3 = $str2; + $str3 =~ s/\./$combi[$x3]/; + my $x4 = '0'; + while ($x4 < 4) + { + my $str4 = $str3; + $str4 =~ s/\./$combi[$x4]/; 
+ $re{$str4} = undef; + $x4++; + } + $x3++; + } + $x2++; + } + $x++; + } + } + elsif ($hasdot eq '5') + { + my $str1 = $str; + my $x = '0'; + my @combi = ('A','C','T','G'); + while ($x < 4) + { + $str1 = $str; + $str1 =~ s/\./$combi[$x]/; + my $x2 = '0'; + while ($x2 < 4) + { + my $str2 = $str1; + $str2 =~ s/\./$combi[$x2]/; + my $x3 = '0'; + while ($x3 < 4) + { + my $str3 = $str2; + $str3 =~ s/\./$combi[$x3]/; + my $x4 = '0'; + while ($x4 < 4) + { + my $str4 = $str3; + $str4 =~ s/\./$combi[$x4]/; + my $x5 = '0'; + while ($x5 < 4) + { + my $str5 = $str4; + $str5 =~ s/\./$combi[$x5]/; + $re{$str5} = undef; + $x5++; + } + $x4++; + } + $x3++; + } + $x2++; + } + $x++; + } + } + elsif ($hasdot eq '6') + { + my $str1 = $str; + my $x = '0'; + my @combi = ('A','C','T','G'); + while ($x < 4) + { + $str1 = $str; + $str1 =~ s/\./$combi[$x]/; + my $x2 = '0'; + while ($x2 < 4) + { + my $str2 = $str1; + $str2 =~ s/\./$combi[$x2]/; + my $x3 = '0'; + while ($x3 < 4) + { + my $str3 = $str2; + $str3 =~ s/\./$combi[$x3]/; + my $x4 = '0'; + while ($x4 < 4) + { + my $str4 = $str3; + $str4 =~ s/\./$combi[$x4]/; + my $x5 = '0'; + while ($x5 < 4) + { + my $str5 = $str4; + $str5 =~ s/\./$combi[$x5]/; + my $x6 = '0'; + while ($x6 < 4) + { + my $str6 = $str5; + $str6 =~ s/\./$combi[$x6]/; + $re{$str6} = undef; + $x6++; + } + $x5++; + } + $x4++; + } + $x3++; + } + $x2++; + } + $x++; + } + + } + elsif ($hasdot eq '7') + { + my $str1 = $str; + my $x = '0'; + my @combi = ('A','C','T','G'); + while ($x < 4) + { + $str1 = $str; + $str1 =~ s/\./$combi[$x]/; + my $x2 = '0'; + while ($x2 < 4) + { + my $str2 = $str1; + $str2 =~ s/\./$combi[$x2]/; + my $x3 = '0'; + while ($x3 < 4) + { + my $str3 = $str2; + $str3 =~ s/\./$combi[$x3]/; + my $x4 = '0'; + while ($x4 < 4) + { + my $str4 = $str3; + $str4 =~ s/\./$combi[$x4]/; + my $x5 = '0'; + while ($x5 < 4) + { + my $str5 = $str4; + $str5 =~ s/\./$combi[$x5]/; + my $x6 = '0'; + while ($x6 < 4) + { + my $str6 = $str5; + $str6 =~ s/\./$combi[$x6]/; + my $x7 
= '0'; + while ($x7 < 4) + { + my $str7 = $str6; + $str7 =~ s/\./$combi[$x7]/; + $re{$str7} = undef; + $x7++; + } + $x6++; + } + $x5++; + } + $x4++; + } + $x3++; + } + $x2++; + } + $x++; + } + } + elsif ($hasdot eq '8') + { + my $str1 = $str; + my $x = '0'; + my @combi = ('A','C','T','G'); + while ($x < 4) + { + $str1 = $str; + $str1 =~ s/\./$combi[$x]/; + my $x2 = '0'; + while ($x2 < 4) + { + my $str2 = $str1; + $str2 =~ s/\./$combi[$x2]/; + my $x3 = '0'; + while ($x3 < 4) + { + my $str3 = $str2; + $str3 =~ s/\./$combi[$x3]/; + my $x4 = '0'; + while ($x4 < 4) + { + my $str4 = $str3; + $str4 =~ s/\./$combi[$x4]/; + my $x5 = '0'; + while ($x5 < 4) + { + my $str5 = $str4; + $str5 =~ s/\./$combi[$x5]/; + my $x6 = '0'; + while ($x6 < 4) + { + my $str6 = $str5; + $str6 =~ s/\./$combi[$x6]/; + my $x7 = '0'; + while ($x7 < 4) + { + my $str7 = $str6; + $str7 =~ s/\./$combi[$x7]/; + my $x8 = '0'; + while ($x8 < 4) + { + my $str8 = $str7; + $str8 =~ s/\./$combi[$x8]/; + $re{$str8} = undef; + $x8++; + } + $x7++; + } + $x6++; + } + $x5++; + } + $x4++; + } + $x3++; + } + $x2++; + } + $x++; + } + } + else + { + $re{$str} = undef; + } + } +} +else +{ + %re = %re3; +} + %re; +} +sub build_partial3c { + +my $A = ""; +my @str = @_; +my $str = $str[0]; +my $reverse = $str[1]; +my %re2; +my %re; +undef %re2; +undef %re; +my $hasstar = $str =~ tr/\*//; + + $re2{$str} = undef; + + if ($hasstar > 0) + { + foreach my $str (keys %re2) + { + if ($reverse eq "reverse" || $reverse eq "reverse_back") + { + my $str2 = $str; + $str2 =~ s/\*.//g; + $re{$str2} = undef; + } + else + { + my $str2 = $str; + $str2 =~ s/.\*//g; + $re{$str2} = undef; + } + if ($reverse eq "reverse" || $reverse eq "back") + { + $str =~ tr/\*//d; + my $temp_sre = $str; + $str = substr $temp_sre, 0, -$hasstar; + $re{$str} = undef; + } + else + { + $str =~ tr/\*//d; + my $temp_sre = $str; + $str = substr $temp_sre, $hasstar; + $re{$str} = undef; + } + } + } + else + { + %re = %re2; + } + %re; +} +sub uniq +{ + my %seen; + 
grep !$seen{$_}++, @_; +} +sub encrypt +{ + my @value = @_; + my $value = $value[0]; + + $value =~ s/AA/0/g; + $value =~ s/CC/1/g; + $value =~ s/TT/2/g; + $value =~ s/GG/3/g; + $value =~ s/AC/4/g; + $value =~ s/AG/5/g; + $value =~ s/AT/6/g; + $value =~ s/CT/7/g; + $value =~ s/CA/8/g; + $value =~ s/CG/9/g; + $value =~ s/TA/Z/g; + $value =~ s/TC/U/g; + $value =~ s/TG/I/g; + $value =~ s/GA/O/g; + $value =~ s/GC/P/g; + $value =~ s/GT/Q/g; + return $value; +} +sub decrypt +{ + my @value = @_; + my $value = $value[0]; + $value =~ s/Q/GT/g; + $value =~ s/P/GC/g; + $value =~ s/O/GA/g; + $value =~ s/I/TG/g; + $value =~ s/U/TC/g; + $value =~ s/Z/TA/g; + $value =~ s/9/CG/g; + $value =~ s/8/CA/g; + $value =~ s/7/CT/g; + $value =~ s/6/AT/g; + $value =~ s/5/AG/g; + $value =~ s/4/AC/g; + $value =~ s/3/GG/g; + $value =~ s/2/TT/g; + $value =~ s/1/CC/g; + $value =~ s/0/AA/g; + return $value; +} +sub IUPAC +{ + my @snps = @_; + my $A = $snps[0]; + my $C = $snps[1]; + my $T = $snps[2]; + my $G = $snps[3]; + my $iupac; + my $most; + my $h; + if ($heteroplasmy ne "") + { + $h = $heteroplasmy*($A+$C+$T+$G); + if (($A+$C+$T+$G) > 30 && $h <= 1) + { + $h = 1.1; + } + } + else + { + $h = 0.07*($A+$C+$T+$G); + } + + if ($A > $C && $A > $T && $A > $G) + { + $most = $A; + } + elsif ($C > $A && $C > $T && $C > $G) + { + $most = $C; + } + elsif ($T > $C && $T > $A && $T > $G) + { + $most = $T; + } + elsif ($G > $A && $G > $T && $G > $C) + { + $most = $G; + } + + if ($A > $h && $C > $h && $T <= $h && $G <= $h) + { + $iupac = "M"; + } + elsif ($A > $h && $G > $h && $T <= $h && $C <= $h) + { + $iupac = "R"; + } + elsif ($A > $h && $T > $h && $G <= $h && $C <= $h) + { + $iupac = "W"; + } + elsif ($C > $h && $G > $h && $A <= $h && $T <= $h) + { + $iupac = "S"; + } + elsif ($C > $h && $T > $h && $A <= $h && $G <= $h) + { + $iupac = "Y"; + } + elsif ($G > $h && $T > $h && $A <= $h && $C <= $h) + { + $iupac = "K"; + } + elsif ($A > $h && $C > $h && $G > $h && $T <= $h) + { + $iupac = "V"; + } + elsif 
($A > $h && $C > $h && $T > $h && $G <= $h) + { + $iupac = "H"; + } + elsif ($A > $h && $G > $h && $T > $h && $C <= $h) + { + $iupac = "D"; + } + elsif ($C > $h && $G > $h && $T > $h && $A <= $h) + { + $iupac = "B"; + } + else + { + $iupac = "N"; + } + return ($iupac, $most); +} +sub IUPAC_reverse +{ + my @snps = @_; + my $iupac = $snps[0]; + my @nucs; + my @nucs2; + my $nuc_alt; + + if ($iupac eq 'A') + { + push @nucs, ('1'); + push @nucs2, ("A"); + $nuc_alt = "A"; + } + elsif ($iupac eq 'C') + { + push @nucs, ('2'); + push @nucs2, ("C"); + $nuc_alt = "C"; + } + elsif ($iupac eq 'T') + { + push @nucs, ('3'); + push @nucs2, ("T"); + $nuc_alt = "T"; + } + elsif ($iupac eq 'G') + { + push @nucs, ('4'); + push @nucs2, ("G"); + $nuc_alt = "G"; + } + elsif ($iupac eq "M") + { + push @nucs, ('1','2'); + push @nucs2, ("A","C"); + $nuc_alt = "A,C"; + } + elsif ($iupac eq "R") + { + push @nucs, ('1','4'); + push @nucs2, ("A","G"); + $nuc_alt = "A,G"; + } + elsif ($iupac eq "W") + { + push @nucs, ('1','3'); + push @nucs2, ("A","T"); + $nuc_alt = "A,T"; + } + elsif ($iupac eq "S") + { + push @nucs, ('4','2'); + push @nucs2, ("G","C"); + $nuc_alt = "G,C"; + } + elsif ($iupac eq "Y") + { + push @nucs, ('3','2'); + push @nucs2, ("T","C"); + $nuc_alt = "T,C"; + } + elsif ($iupac eq "K") + { + push @nucs, ('3','4'); + push @nucs2, ("T","G"); + $nuc_alt = "T,G"; + } + elsif ($iupac eq "V") + { + push @nucs, ('1','2','4'); + push @nucs2, ("A","C","G"); + $nuc_alt = "A,C,G"; + } + elsif ($iupac eq "H") + { + push @nucs, ('1','2','3'); + push @nucs2, ("A","C","T"); + $nuc_alt = "A,C,T"; + } + elsif ($iupac eq "D") + { + push @nucs, ('1','3','4'); + push @nucs2, ("A","T","G"); + $nuc_alt = "A,T,G"; + } + elsif ($iupac eq "B") + { + push @nucs, ('3','2','4'); + push @nucs2, ("T","C","G"); + $nuc_alt = "T,C,G"; + } + elsif ($iupac eq "N") + { + push @nucs, ('1','2','3','4'); + push @nucs2, ("A","C","T","G"); + $nuc_alt = "A,C,T,G"; + } + return ($nuc_alt, \@nucs); +} +sub correct +{ + my 
@str = @_; + my $read_correct = $str[0]; + my %rep_pairs = %{$str[1]}; + my $hp = $str[2]; + my $cc = '0'; + my $rep = ""; + + if (keys %rep_pairs > 1) + { + $rep = "yes"; + if ($y > $startprint2) + { + } + } + $cc = '0'; + my %read_part; + my %read_part_reverse; + my %read_matches; + my @read_matches; + undef %read_part; + undef %read_part_reverse; + undef %read_matches; + undef @read_matches; + if ($y > $startprint2) + { + print OUTPUT5 $read_correct."\n"; + } + while ($cc < (length($read_correct)/2)-$overlap+($overlap/2)) + { + my $read_part = substr $read_correct, $cc, $overlap; + $read_part =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + if ($hp eq "") + { + %read_part = build_partial3b $read_part; + } + else + { + %read_part = build_partial2b $read_part; + } + foreach my $read_part2 (keys %read_part) + { + if (exists($hash2b{$read_part2})) + { + my $id_part = $hash2b{$read_part2}; + $id_part = substr $id_part, 1; + my @id_part = split /,/,$id_part; + + foreach my $id_part2 (@id_part) + { + my $id_part2_end = substr $id_part2, -1 , 1, ""; + if (exists($hash{$id_part2})) + { + my @id_part2_tmp = split /,/,$hash{$id_part2}; + my $id_part2_tmp_new; + if (exists($hash{$id_part2})) + { + my @id_part2_tmp = split /,/,$hash{$id_part2}; + my $id_part2_tmp_new; + + if ($id_part2_end eq "1") + { + $id_part2_tmp_new = $id_part2_tmp[0]; + } + elsif ($id_part2_end eq "2") + { + $id_part2_tmp_new = $id_part2_tmp[1]; + } + if ($use_quality ne "") + { + $id_part2_tmp_new =~ tr/1|2|3|4/N/; + } + if ($encrypt eq "yes") + { + $id_part2_tmp_new = decrypt $id_part2_tmp_new; + } + my $id_part2_tmp_new3 = substr $id_part2_tmp_new,0, length($id_part2_tmp_new)- $cc+$left; + while (length($id_part2_tmp_new3) < length($id_part2_tmp_new)) + { + $id_part2_tmp_new3 = "N".$id_part2_tmp_new3; + } + if ($cc < $left) + { + $id_part2_tmp_new3 = substr $id_part2_tmp_new,$left-$cc, length($id_part2_tmp_new)- $cc+$left; + } + $read_matches{$id_part2_tmp_new3} = $cc; + push @read_matches, $id_part2_tmp_new3; + 
} + } + } + } + } + my $read_part_d = $read_part; + + $read_part_d =~ tr/ATCG/TAGC/; + my $read_part_reverse = reverse($read_part_d); + if ($hp eq "") + { + %read_part_reverse = build_partial3b $read_part_reverse; + } + else + { + %read_part_reverse = build_partial2b $read_part_reverse; + } + + foreach my $read_part2 (keys %read_part_reverse) + { + if (exists($hash2c{$read_part2})) + { + my $id_part = $hash2c{$read_part2}; + $id_part = substr $id_part, 1; + my @id_part = split /,/,$id_part; + + foreach my $id_part2 (@id_part) + { + my $id_part2_end = substr $id_part2, -1 , 1, ""; + if (exists($hash{$id_part2})) + { + my @id_part2_tmp = split /,/,$hash{$id_part2}; + my $id_part2_tmp_new; + + if ($id_part2_end eq "1") + { + $id_part2_tmp_new = $id_part2_tmp[0]; + } + elsif ($id_part2_end eq "2") + { + $id_part2_tmp_new = $id_part2_tmp[1]; + } + if ($use_quality ne "") + { + $id_part2_tmp_new =~ tr/1|2|3|4/N/; + } + if ($encrypt eq "yes") + { + $id_part2_tmp_new = decrypt $id_part2_tmp_new; + } + my $id_part2_tmp_new2b = $id_part2_tmp_new; + $id_part2_tmp_new2b =~ tr/ATCG/TAGC/; + $id_part2_tmp_new = reverse($id_part2_tmp_new2b); + my $id_part2_tmp_new3 = substr $id_part2_tmp_new,0, length($id_part2_tmp_new)- $cc+$right; + while (length($id_part2_tmp_new3) < length($id_part2_tmp_new)) + { + $id_part2_tmp_new3 = "N".$id_part2_tmp_new3; + } + if ($cc < $right) + { + $id_part2_tmp_new3 = substr $id_part2_tmp_new,$right-$cc, length($id_part2_tmp_new)- $cc+$right; + } + $read_matches{$id_part2_tmp_new3} = $cc; + push @read_matches, $id_part2_tmp_new3; + } + } + } + } + $cc++; + } + $cc -= ($overlap/2); + while ($cc < length($read_correct)-$overlap) + { + my $read_part = substr $read_correct, $cc, $overlap; + $read_part =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + if ($hp eq "") + { + %read_part = build_partial3b $read_part; + } + else + { + %read_part = build_partial2b $read_part; + } + + foreach my $read_part2 (keys %read_part) + { + if (exists($hash2c{$read_part2})) + { + my 
$id_part = $hash2c{$read_part2}; + $id_part = substr $id_part, 1; + my @id_part = split /,/,$id_part; + + foreach my $id_part2 (@id_part) + { + my $id_part2_end = substr $id_part2, -1 , 1, ""; + if (exists($hash{$id_part2})) + { + my @id_part2_tmp = split /,/,$hash{$id_part2}; + my $id_part2_tmp_new; + + if ($id_part2_end eq "1") + { + $id_part2_tmp_new = $id_part2_tmp[0]; + } + elsif ($id_part2_end eq "2") + { + $id_part2_tmp_new = $id_part2_tmp[1]; + } + if ($use_quality ne "") + { + $id_part2_tmp_new =~ tr/1|2|3|4/N/; + } + if ($encrypt eq "yes") + { + $id_part2_tmp_new = decrypt $id_part2_tmp_new; + } + my $id_part2_tmp_new3 = substr $id_part2_tmp_new, -$cc -$overlap-$right; + if (length($id_part2_tmp_new)-$cc-$overlap < $right) + { + $id_part2_tmp_new3 = substr $id_part2_tmp_new, -$cc -$overlap-$right, -($right-length($id_part2_tmp_new)+$cc+$overlap); + while (length($id_part2_tmp_new3) < length($id_part2_tmp_new)) + { + $id_part2_tmp_new3 = "N".$id_part2_tmp_new3; + } + } + $read_matches{$id_part2_tmp_new3} = $cc; + push @read_matches, $id_part2_tmp_new3; + } + } + } + } + my $read_part_d = $read_part; + + $read_part_d =~ tr/ATCG/TAGC/; + my $read_part_reverse = reverse($read_part_d); + if ($hp eq "") + { + %read_part_reverse = build_partial3b $read_part_reverse; + } + else + { + %read_part_reverse = build_partial2b $read_part_reverse; + } + + foreach my $read_part2 (keys %read_part_reverse) + { + if (exists($hash2b{$read_part2})) + { + my $id_part = $hash2b{$read_part2}; + $id_part = substr $id_part, 1; + my @id_part = split /,/,$id_part; + foreach my $id_part2 (@id_part) + { + my $id_part2_end = substr $id_part2, -1 , 1, ""; + if (exists($hash{$id_part2})) + { + my @id_part2_tmp = split /,/,$hash{$id_part2}; + my $id_part2_tmp_new; + + if ($id_part2_end eq "1") + { + $id_part2_tmp_new = $id_part2_tmp[0]; + } + elsif ($id_part2_end eq "2") + { + $id_part2_tmp_new = $id_part2_tmp[1]; + } + if ($use_quality ne "") + { + $id_part2_tmp_new =~ tr/1|2|3|4/N/; + } 
+ if ($encrypt eq "yes") + { + $id_part2_tmp_new = decrypt $id_part2_tmp_new; + } + my $id_part2_tmp_new2b = $id_part2_tmp_new; + $id_part2_tmp_new2b =~ tr/ATCG/TAGC/; + $id_part2_tmp_new = reverse($id_part2_tmp_new2b); + + my $id_part2_tmp_new3 = substr $id_part2_tmp_new, -$cc -$overlap-$left; + + if (length($id_part2_tmp_new)-$cc-$overlap < $left) + { + $id_part2_tmp_new3 = substr $id_part2_tmp_new, -$cc -$overlap-$left, -($left-length($id_part2_tmp_new)+$cc+$overlap); + while (length($id_part2_tmp_new3) < length($id_part2_tmp_new)) + { + $id_part2_tmp_new3 = "N".$id_part2_tmp_new3; + } + } + $read_matches{$id_part2_tmp_new3} = $cc; + push @read_matches, $id_part2_tmp_new3; + } + } + } + } + $cc++; + } + my $count_cor = '0'; + $count_cor++ for (keys %read_matches); + if ($count_cor < 4) + { + if ($y > $startprint2) + { + print OUTPUT5 "\nBAD_READ1\n"; + } + if ($y eq '1') + { + $bad_read = "yes"; + goto FIRST_SEED; + } + else + { + goto CORRECT_END; + } + } + + my $corrected_read = ""; + my @charso = split //, $read_correct; + my @read_matches2; + undef @read_matches2; + foreach my $extensions (@read_matches) + { + my $l = '0'; + my @chars = split //, $extensions; + my $matching = '0'; + while ($l < length($read_correct)) + { + if ($chars[$l] eq $charso[$l]) + { + $matching++; + } + $l++; + } + my $count_N = $extensions =~ tr/N/N/; + my $gg = '0.95'; + if ($hp ne "") + { + $gg = '0.80'; + } + if ($matching > $gg*(length($extensions)-$count_N) && $count_N < length($extensions)) + { + push @read_matches2, $extensions; + } + } + my $count_cor2 = '0'; + $count_cor2++ for (@read_matches2); + if ($count_cor2 < 4) + { + if ($y > $startprint2) + { + print OUTPUT5 "\nBAD_READ2\n"; + } + if ($y eq '1') + { + $bad_read = "yes"; + goto FIRST_SEED; + } + else + { + goto CORRECT_END; + } + } + + my $l = '0'; + while ($l < length($read_correct)) + { + my $A = '0'; + my $C = '0'; + my $T = '0'; + my $G = '0'; + +POINT: foreach my $extensions (@read_matches2) + { + my @chars = 
split //, $extensions; + if ($chars[$l] eq "A") + { + $A++; + } + elsif ($chars[$l] eq "C") + { + $C++; + } + elsif ($chars[$l] eq "T") + { + $T++; + } + elsif ($chars[$l] eq "G") + { + $G++; + } + if (($A + $C + $T + $G) > 500) + { + last POINT; + } + } + my $tt = '3'; + my $extra; + if ($hp > 0.05) + { + $extra = '0.02'; + } + else + { + $extra = '0'; + } + if ($hp ne "") + { + $tt = $hp*($A + $C + $T + $G); + } + if ($A >= ($C + $T + $G)*1.8 && (($C <= $tt && $T <= $tt && $G <= $tt) || $hp eq "")) + { + $corrected_read = $corrected_read."A"; + } + elsif ($C >= ($A + $T + $G)*1.8 && (($A <= $tt && $T <= $tt && $G <= $tt) || $hp eq "")) + { + $corrected_read = $corrected_read."C"; + } + elsif ($T >= ($A + $C + $G)*1.8 && (($C <= $tt && $A <= $tt && $G <= $tt) || $hp eq "")) + { + $corrected_read = $corrected_read."T"; + } + elsif ($G >= ($C + $T + $A)*1.8 && (($C <= $tt && $T <= $tt && $A <= $tt) || $hp eq "")) + { + $corrected_read = $corrected_read."G"; + } + elsif (length($corrected_read) > 15 && $rep eq "yes") + { + my $last_15 = substr $corrected_read, -15; + my $count = '0'; + my $count2 = '0'; + foreach my $rep_pair (keys %rep_pairs) + { + my $check_rep_pair = $rep_pair =~ s/$last_15/$last_15/g; + if ($check_rep_pair eq '1') + { + if ($rep_pair =~ m/.*$last_15(.).*/) + { + my $nuc = $1; + if ($charso[$l] eq $nuc) + { + $count++; + } + else + { + $count2++; + } + } + } + } + if ($count > 2 && $count2 eq '0') + { + $corrected_read = $corrected_read.$charso[$l]; + } + else + { + $corrected_read = $corrected_read."."; + } + } + else + { + $corrected_read = $corrected_read."."; + } + $l++; + } + if ($y > $startprint2) + { + print OUTPUT5 $corrected_read." 
CORRECTED READ\n"; + } + my $dot_bad = $corrected_read =~ tr/\./\./; + my $ff = '3'; + if ($hp ne "") + { + $ff = length($corrected_read)/15; + } + while ($dot_bad > $ff && length($corrected_read) > ($overlap*2)+15) + { + my $corrected_read_tmp = substr $corrected_read, 5, -5; + $corrected_read = $corrected_read_tmp; + $dot_bad = $corrected_read =~ tr/\./\./; + $ff = length($corrected_read)/15; + } + if ($dot_bad > $ff) + { + if ($y > $startprint2) + { + print OUTPUT5 "\nBAD_READ3\n"; + } + if ($y eq '1') + { + $bad_read = "yes2"; + goto FIRST_SEED; + } + } + + $read_correct = $corrected_read; + if ($y eq '1') + { + $read = $read_correct; + } + + $seeds_check{$id} = undef; + + delete $seed_old{$id_pair}; + $bad_read = ""; + return $read_correct; +CORRECT_END: +} +sub check_deletion +{ + my @str = @_; + my $best_extension1 = $str[0]; + my $best_extension2 = $str[1]; + my $best_extension_old1_tmp = $str[2]; + my $best_extension_old2_tmp = $str[3]; + my $ref_checking = $str[4]; + my $back = $str[5]; + my $best_extension_tmp; + my $one_or_two; + my $shorter = '0'; + + my @short; + my @long; + undef @short; + undef @long; + my $p = '0'; + my $amatch = '0'; + my $nomatch = '0'; + my $best_extension_short = ""; + my $best_extension_long = ""; + my $best_amatch = '0'; + my $best_p; + my $finish; + + if ($y > $startprint2) + { + print OUTPUT5 "CHECK_DELETION\n"; + } + my $best_extension1_tmp; + my $best_extension2_tmp; + if (length($best_extension_old1_tmp) > length($best_extension1)) + { + $best_extension1_tmp = $best_extension_old1_tmp; + } + else + { + $best_extension1_tmp = $best_extension1; + } + if (length($best_extension_old2_tmp) > length($best_extension2)) + { + $best_extension2_tmp = $best_extension_old2_tmp; + } + else + { + $best_extension2_tmp = $best_extension2; + } + if (length($best_extension1_tmp) >= length($best_extension2_tmp)) + { + $best_extension_short = $best_extension2_tmp; + $best_extension_long = $best_extension1_tmp; + $best_extension_short =~ 
tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $best_extension_long =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + @short = split //, $best_extension_short; + @long = split //, $best_extension_long; + } + elsif (length($best_extension2_tmp) > length($best_extension1_tmp)) + { + $best_extension_short = $best_extension1_tmp; + $best_extension_long = $best_extension2_tmp; + $best_extension_short =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $best_extension_long =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + @short = split //, $best_extension_short; + @long = split //, $best_extension_long; + } +CHAR0:while (($p < length($best_extension_short) || $p < 15) && ($p < 25 || $ref_checking eq "yes")) + { + my $i = '1'; + $amatch = '0'; + $nomatch = '0'; +CHAR: while ($i < @short-$p) + { + if ($short[$i+$p] eq $long[$i-1]) + { + $amatch++; + } + elsif ($short[$i+$p] eq ".") + { + } + else + { + $nomatch++; + } + $i++; + } + $p++; + if (($amatch > ($nomatch*6) || ($ref_checking eq "yes" && $amatch >= ($nomatch*2))) && (($amatch > 2 && $p < 3) || $amatch > 7)) + { + if ($amatch > $best_amatch && $finish eq "") + { + $best_amatch = $amatch; + $best_p = $p; + if ($y > $startprint2) + { + print OUTPUT5 $p." 
DELETION_SHORT\n"; + } + next CHAR0; + } + elsif ($finish eq "") + { + next CHAR0; + } + + my $indel = substr $best_extension_short, 0, $p; + if ($back eq "back") + { + $indel =~ s/A/*A/g; + $indel =~ s/T/*T/g; + $indel =~ s/G/*G/g; + $indel =~ s/C/*C/g; + } + else + { + $indel =~ s/A/A*/g; + $indel =~ s/T/T*/g; + $indel =~ s/G/G*/g; + $indel =~ s/C/C*/g; + } + + my $after_indel; + if ($nomatch > 0) + { + my $after_indel1 = substr $best_extension_short, $p; + my @chars3 = split //, $after_indel1; + my $f = '0'; + +INDEL0: while ($f < @chars3) + { + if ($chars3[$f] eq $long[$f]) + { + $after_indel .= $chars3[$f]; + } + elsif ($f < 20) + { + $after_indel .= "."; + } + else + { + $shorter = length($best_extension_short) -$f -$p; + last INDEL0; + } + $f++; + } + } + else + { + my $after_indel1 = substr $best_extension_short, $p, $p+$i-1; + my @chars3 = split //, $after_indel1; + my $f = '0'; + +INDEL1: while ($f < @chars3) + { + if ($chars3[$f] eq $long[$f]) + { + $after_indel .= $chars3[$f]; + } + else + { + last INDEL1; + } + $f++; + } + } + $best_extension_tmp = $indel.$after_indel; + if ($y > $startprint2) + { + print OUTPUT5 $best_extension_tmp." BEST_EXTENSION\n"; + } + if ($after_indel eq "") + { + $best_extension_tmp = ""; + } + $one_or_two = "one"; + goto END_INDEL; + } + } + if ($best_amatch ne '0' && $finish eq "") + { + print OUTPUT5 $best_p." 
GOTOCHAR0\n"; + $p = $best_p-1; + $finish = "yes"; + goto CHAR0; + } + $p = '0'; +CHAR1:while (($p < (length($best_extension_long)-length($best_extension_short)) || $p < 15) && ($p < 25 || $ref_checking eq "yes")) + { + my $i = '1'; + $amatch = '0'; + $nomatch = '0'; +CHAR2: while ($i <= @short && $i < @long-$p) + { + if ($short[$i-1] eq $long[$i+$p]) + { + $amatch++; + } + elsif ($short[$i-1] eq ".") + { + } + else + { + $nomatch++; + } + $i++; + } + $p++; + + if (($amatch > ($nomatch*6) || ($ref_checking eq "yes" && $amatch >= ($nomatch*2))) && (($amatch > 2 && $p < 3) || $amatch > 7)) + { + if ($amatch > $best_amatch && $finish eq "") + { + $best_amatch = $amatch; + $best_p = $p; + if ($y > $startprint2) + { + print OUTPUT5 $p." DELETION_LONG\n"; + } + next CHAR1; + } + elsif ($finish eq "") + { + next CHAR1; + } + + my $indel = substr $best_extension_long,0, $p; + if ($back eq "back") + { + $indel =~ s/A/*A/g; + $indel =~ s/T/*T/g; + $indel =~ s/G/*G/g; + $indel =~ s/C/*C/g; + } + else + { + $indel =~ s/A/A*/g; + $indel =~ s/T/T*/g; + $indel =~ s/G/G*/g; + $indel =~ s/C/C*/g; + } + + my $after_indel; + if ($nomatch > 0) + { + my $after_indel1 = substr $best_extension_long, $p; + my @chars3 = split //, $after_indel1; + my $f = '0'; + +INDEL2: while ($f < @chars3 && $f < length($best_extension_short)) + { + if ($chars3[$f] eq $short[$f]) + { + $after_indel .= $chars3[$f]; + } + elsif ($f < 20) + { + $after_indel .= "."; + } + else + { + $shorter = length($best_extension_short) - $f; + last INDEL2; + } + $f++; + } + } + else + { + my $after_indel1 = substr $best_extension_long, $p, $p+$i-1;; + my @chars3 = split //, $after_indel1; + my $f = '0'; + +INDEL3: while ($f < @chars3) + { + if ($chars3[$f] eq $short[$f]) + { + $after_indel .= $chars3[$f]; + } + else + { + last INDEL3; + } + $f++; + } + } + $best_extension_tmp = $indel.$after_indel; + + if ($after_indel eq "") + { + $best_extension_tmp = ""; + } + $one_or_two = "two"; + goto END_INDEL; + } + } + if 
($best_amatch ne '0' && $finish eq "")
    {
        print OUTPUT5 $best_p." GOTOCHAR1\n";
        $p = $best_p-1;
        $finish = "yes";
        goto CHAR1;
    }
END_INDEL:
    return ($best_extension_tmp, $one_or_two, $shorter);
}

# mismatch: find the candidate extensions that disagree too often with the
# best extension.
#
# Arguments (positional):
#   0  hash ref whose keys are the candidate extension strings
#   1  hash ref of extensions to skip (only the existence of the key is tested)
#   2  the best extension; the IUPAC codes K R Y S W M B D H V in it are
#      treated as "N"
# Returns a hash (flattened to a list) extension => position for every
# candidate that exceeded the tolerated number of mismatches.  The stored
# position is that of the LAST over-budget mismatch, because each later one
# overwrites the previous value.
sub mismatch
{
    my %extensions     = %{ $_[0] };
    my %list           = %{ $_[1] };
    my $best_extension = $_[2];

    # '|' is a literal character inside tr///, so a pipe is mapped to N as well.
    $best_extension =~ tr/K|R|Y|S|W|M|B|D|H|V/N/;
    my @chars_best = split //, $best_extension;
    my %return;

    foreach my $candidate (keys %extensions)
    {
        next if exists $list{$candidate};

        my @chars    = split //, $candidate;
        my $mismatch = 0;

        foreach my $d (0 .. $#chars_best)
        {
            # Identical bases and N on either side never count as a mismatch.
            next if $chars[$d] eq $chars_best[$d]
                 || $chars_best[$d] eq "N"
                 || $chars[$d] eq "N";

            # The candidate is shorter than the best extension: stop comparing.
            last if $chars_best[$d] eq "" || $chars[$d] eq "";

            # Budget: two mismatches, or one per five compared positions.
            if ($mismatch < 2 || $mismatch < $d/5)
            {
                $mismatch++;
            }
            else
            {
                $return{$candidate} = $d;
            }
        }
    }
    return %return;
}

# check_HP_pos: compare a paired read with every entry of %SNPs that falls
# inside the window the read covers.
#
# Arguments (positional):
#   0  $ff_tmp          offset added to the window start
#   1  $pos             position the window start is derived from
#   2  $pos_back        -$pos_back is the lowest start the window may take
#   3  $match_pair_tmp  sequence of the paired read
#   4  $ln              key used in %accepted_SNPs and %accepted_SNPs_pair
# Reads the file-level %SNPs, $insert_size and $insert_range.
#
# Returns "yes" when the read covers at least one key of %SNPs and carries the
# stored base at every covered key; $ln is then added to %accepted_SNPs_pair.
# Returns "" when no key is covered, or as soon as one covered base differs;
# in the latter case $ln is removed from %accepted_SNPs.
sub check_HP_pos
{
    my ($ff_tmp, $pos, $pos_back, $match_pair_tmp, $ln) = @_;

    my $one = $pos - ($insert_size * $insert_range) + 10;
    $one = -$pos_back if $one < -$pos_back;

    my $window_start = $one + $ff_tmp - 10;
    my $window_end   = $window_start + length($match_pair_tmp);
    my $check        = "";

    foreach my $pos_tmp (keys %SNPs)
    {
        next unless $pos_tmp > $window_start && $pos_tmp < $window_end;

        my $hp_SNP_read = substr $match_pair_tmp, $pos_tmp - $window_start - 1, 1;
        if ($hp_SNP_read ne $SNPs{$pos_tmp})
        {
            delete $accepted_SNPs{$ln};
            # Always report a rejection here.  Returning $check would hand back
            # "yes" whenever a matching SNP happened to be visited first, so the
            # result would depend on the iteration order of keys %SNPs.
            return "";
        }
        $check = "yes";
    }

    $accepted_SNPs_pair{$ln} = undef if $check eq "yes";
    return $check;
}

# check_HP_pos_back: same test as check_HP_pos, with a differently computed
# window.
#
# Arguments (positional):
#   0  $ff_tmp          offset added to the window start
#   1  $F               offset added to the window start
#   2  $pos_back        subtracted from the window start
#   3  $match_pair_tmp  sequence of the paired read
#   4  $ln              key used in %accepted_SNPs_back and
#                       %accepted_SNPs_pair_back
# Reads the file-level %SNPs, $read, $insert_size, $insert_range,
# $hp_seed_assemble and $right.
#
# Returns "yes" when at least one key of %SNPs is covered and every covered
# base equals the stored one; $ln is then added to %accepted_SNPs_pair_back.
# Returns "" when nothing is covered, or as soon as one covered base differs;
# in the latter case $ln is removed from %accepted_SNPs_back.
sub check_HP_pos_back
{
    my ($ff_tmp, $F, $pos_back, $match_pair_tmp, $ln) = @_;

    # The margin of 5 is dropped while $read is shorter than
    # $insert_size*$insert_range and $hp_seed_assemble is set.
    my $extra = '5';
    if (length($read) < $insert_size*$insert_range && $hp_seed_assemble ne "")
    {
        $extra = '0';
    }

    my $window_start = -$pos_back + $F + $ff_tmp - ($right + $extra);
    my $window_end   = $window_start + length($match_pair_tmp);
    my $check        = "";

    foreach my $pos_tmp (keys %SNPs)
    {
        # NOTE(review): the lower bound is inclusive here but exclusive in
        # check_HP_pos.  When $pos_tmp equals $window_start the substr offset
        # below is -1, which reads the LAST base of the read.  Confirm which
        # coordinate convention is intended before changing it.
        next unless $pos_tmp >= $window_start && $pos_tmp < $window_end;

        my $hp_SNP_read = substr $match_pair_tmp, $pos_tmp - $window_start - 1, 1;
        if ($hp_SNP_read ne $SNPs{$pos_tmp})
        {
            delete $accepted_SNPs_back{$ln};
            # Always report a rejection, independent of the order in which
            # keys %SNPs was walked.
            return "";
        }
        $check = "yes";
    }

    $accepted_SNPs_pair_back{$ln} = undef if $check eq "yes";
    return $check;
}


my @reads_tmp = undef;

if ($reads12 eq "")
{
    @reads_tmp = ($reads1, $reads2);
    if ($reads1 eq $reads2)
    {
        die "\nThe two input files are identical, please check the configuration file!\n";
    }
}
else
{
    @reads_tmp = ($reads12);
}
my $output_file5 = "log_extended_".$project.".txt";
my $output_file6 = "contigs_tmp_".$project.".txt";
my $output_file7 = "Merged_contigs_".$project.".txt";
my $output_file10 = "Assembled_reads_".$project."_R1.fasta";
my $output_file11 = "Assembled_reads_".$project."_R2.fasta";
my $output_file12 = "Variance_".$project.".vcf";
my $output_file13 = "Heteroplasmy_".$project.".vcf";
my $output_file14 = "Heteroplasmy_assemblies_".$project.".fasta";


my $check_zip = substr $reads_tmp[0], -2;
my $check_zip2 = substr $reads_tmp[0], -3;
my $firstLine;
my $secondLine;
my $thirdLine;
my $fourthLine;
my $fifthLine;

if ($check_zip eq "gz")
{
    open (my $FILE, '-|', 'gzip', '-dc', $reads_tmp[0]) or die "Can't open file $reads_tmp[0], $!\n";
    $firstLine = <$FILE>;
    chomp $firstLine;
    $secondLine = <$FILE>;
    chomp $secondLine;
    $thirdLine = <$FILE>;
    chomp $thirdLine;
    $fourthLine = <$FILE>;
    chomp $fourthLine;
    $fifthLine = <$FILE>;
    chomp $fifthLine;
    close $FILE;
}
elsif ($check_zip2 eq "bz2")
{
    open (my $FILE, '-|', 'bzip2', '-dc', $reads_tmp[0]) or
die "Can't open file $reads_tmp[0], $!\n"; + $firstLine = <$FILE>; + chomp $firstLine; + $secondLine = <$FILE>; + chomp $secondLine; + $thirdLine = <$FILE>; + chomp $thirdLine; + $fourthLine = <$FILE>; + chomp $fourthLine; + $fifthLine = <$FILE>; + chomp $fifthLine; + close $FILE; +} +else +{ + open(INPUT, $reads_tmp[0]) or die "No input file found, make sure it are fastq files $!\n"; + $firstLine = ; + chomp $firstLine; + $secondLine = ; + chomp $secondLine; + $thirdLine = ; + chomp $thirdLine; + $fourthLine = ; + chomp $fourthLine; + $fifthLine = ; + chomp $fifthLine; + close INPUT; +} + +open(INPUT3, $seed_input0) or die "\nCan't open the seed file, $!\n"; +open(OUTPUT6, ">" .$output_file6) or die "\nCan't open file $output_file6, $!\n"; + +if ($print_log eq '1' || $print_log eq '2') +{ + open(OUTPUT5, ">" .$output_file5) or die "\nCan't open file $output_file5, $!\n"; +} +if ($save_reads ne "") +{ + open(OUTPUT10, ">" .$output_file10) or die "Can't open saved reads1 file $output_file10, $!\n"; + open(OUTPUT11, ">" .$output_file11) or die "Can't open saved reads2 file $output_file11, $!\n"; +} +if ($variance_detection eq "yes") +{ + open(OUTPUT12, ">" .$output_file12) or die "Can't open variance file $output_file12, $!\n"; +} +if ($heteroplasmy ne "") +{ + open(OUTPUT13, ">" .$output_file13) or die "Can't open heteroplasmy file $output_file13, $!\n"; +} +select(STDERR); +$| = 1; +select(STDOUT); # default +$| = 1; +print "\nReading Input..."; + +my $firstLine_reverse = ""; +if ($reads12 eq "") +{ + if ($check_zip eq "gz") + { + open (my $FILE, '-|', 'gzip', '-dc', $reads_tmp[1]) or die "Can't open file $reads_tmp[1], $!\n"; + $firstLine_reverse = <$FILE>; + chomp $firstLine_reverse; + $secondLine = <$FILE>; + close $FILE; + } + elsif ($check_zip2 eq "bz2") + { + open (my $FILE, '-|', 'bzip2', '-dc', $reads_tmp[1]) or die "Can't open file $reads_tmp[1], $!\n"; + $firstLine_reverse = <$FILE>; + chomp $firstLine_reverse; + $secondLine = <$FILE>; + close $FILE; + } 
+ else + { + open(INPUT2, $reads_tmp[1]) or die "\n\nNo input file found, make sure it are fastq files $!\n"; + $firstLine_reverse = ; + chomp $firstLine_reverse; + close INPUT2; + } +} +my $no_quality_score = substr $thirdLine, 0, 1; +my $type_of_file; +my $code_before_end = substr $firstLine, -2,1; +my $code_before_end0 = substr $firstLine, -1,1; +my $SRA = ""; +if ($paired eq "SE") +{ + $type_of_file = '0'; +} +elsif (($code_before_end eq "/" || $code_before_end eq "R") && $firstLine_reverse ne $firstLine) +{ + $type_of_file = '-1'; +} +elsif ($code_before_end eq ":" && $code_before_end0 eq "1" && $firstLine_reverse ne $firstLine) +{ + $type_of_file = '-1'; +} +elsif($firstLine =~ m/.*(_|\s)(1)(:\w.*\d+:*(\s.*)*\s*\t*)$/ && $firstLine_reverse ne $firstLine) +{ + $type_of_file = "yes"; +} +elsif($firstLine =~ m/.*\s(1)(\S*)$/ && $firstLine_reverse ne $firstLine) +{ + my $firstLine_tmp = $firstLine; + my $test_space = $firstLine_tmp =~ tr/ //; + $type_of_file = -length($2)-1; + if ($test_space eq '1') + { + $type_of_file = "split"; + } +} +elsif($firstLine =~ m/.*_(1)(:N.*)$/ && $firstLine_reverse ne $firstLine) +{ + $type_of_file = -length($2)-1; +} +elsif($firstLine =~ m/\S*\.(1)(\s(\d+)\s.*)$/ && $firstLine_reverse ne $firstLine) +{ + my $test1 = $3; + if($fifthLine =~ m/\S*\.(1)(\s(\d+)(\s.*))$/ && $firstLine_reverse ne $firstLine) + { + my $test2 = $3; + if ($test2 eq $test1) + { + $type_of_file = -length($2)-1; + } + else + { + $type_of_file = -length($4); + $SRA = "yes"; + } + } +} +elsif($fifthLine =~ m/.*\.(1)(\s(.+)\s.+)$/ && $firstLine_reverse ne $firstLine) +{ + $type_of_file = -length($2)-1; +} +elsif($firstLine_reverse eq $firstLine) +{ + $type_of_file = "identical"; +} +elsif($reads12 ne "") +{ + print "\n\nCOMBINED FILE NOT SUPPORTED, PLEASE TRY SEPERATE FILES FOR THE FORWARD AND REVERSE READS!\n\n"; + print OUTPUT4 "\n\nCOMBINED FILE NOT SUPPORTED, PLEASE TRY SEPERATE FILES FOR THE FORWARD AND REVERSE READS!\n\n"; + exit; +} +else +{ + print 
"\n\nTHE INPUT READS HAVE AN INCORRECT FILE FORMAT!\nPLEASE SEND ME THE ID STRUCTURE!\n\n"; + print OUTPUT4 "\n\nTHE INPUT READS HAVE AN INCORRECT FILE FORMAT!\nPLEASE SEND ME THE ID STRUCTURE!\n\n"; + exit; +} + +my $last_character = substr $secondLine, -1; +my $space_at_end; +if ($last_character =~ m/\s|\t/g) +{ + $space_at_end = "yes"; +} +print "...OK\n"; + +if ($max_memory eq "") +{ + $max_memory = '15'; +} + +my %cp_ref; +my %cp_ref2; + +if ($cp_input ne "") +{ + select(STDERR); + $| = 1; + select(STDOUT); # default + $| = 1; + print "\nScan chloroplast sequence..."; + open(INPUT4, $cp_input) or die "\n\nCan't open chloroplast file $cp_input, $!\n"; + my $ff = '0'; + my $value_ref = ""; + my $ref_reverse_tmp; + + while (my $line = ) + { + if ($ff < 1) + { + $ff++; + next; + } + chomp $line; + $line =~ tr/actgn/ACTGN/; + + my $line3 = $value_ref.$line; + $ref_reverse_tmp .= $line; + + while (length($line3) > ((35*3)-1)) + { + my $value_ref2 = substr $line3, 0, 35; + my $line2 = $line3; + $line3 = substr $line2, 1; + + $cp_ref{$value_ref2} .= exists $cp_ref{$value_ref2} ? ",$ff" : $ff; + $cp_ref2{$ff} .= exists $cp_ref2{$ff} ? "$value_ref2" : $value_ref2; + $ff++; + } + $value_ref = $line3; + } + my $ref_reverse = reverse($ref_reverse_tmp); + $ref_reverse =~ tr/ACGT/TGCA/; + while (length($ref_reverse) > ((35*3)-1)) + { + my $value_ref2 = substr $ref_reverse, 0, 35; + my $line2 = $ref_reverse; + $ref_reverse = substr $line2, 1; + + $cp_ref{$value_ref2} .= exists $cp_ref{$value_ref2} ? ",$ff" : $ff; + $cp_ref2{$ff} .= exists $cp_ref2{$ff} ? 
"$value_ref2" : $value_ref2; + $ff++; + } + close INPUT4; + print "...OK\n"; +} +if ($variance_detection eq "yes") +{ + my ($wday, $mon, $mday, $hour, $min, $sec, $year) = localtime; + my @localtime = split / /, localtime; + my %mon2num = qw( + Jan 01 Feb 02 Mar 03 Apr 04 May 05 Jun 06 + Jul 07 Aug 08 Sep 09 Oct 10 Nov 11 Dec 12 + ); + my $month = $localtime[1]; + if (exists($mon2num{$localtime[1]})) + { + $month = $mon2num{$localtime[1]}; + } + print OUTPUT12 "##fileformat=VCFv4.0\n"; + print OUTPUT12 "##fileDate=".$localtime[4].$month.$localtime[2]."\n"; + print OUTPUT12 "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; +} +if ($heteroplasmy ne "") +{ + my ($wday, $mon, $mday, $hour, $min, $sec, $year) = localtime; + my @localtime = split / /, localtime; + my %mon2num = qw( + Jan 01 Feb 02 Mar 03 Apr 04 May 05 Jun 06 + Jul 07 Aug 08 Sep 09 Oct 10 Nov 11 Dec 12 + ); + my $month = $localtime[1]; + if (exists($mon2num{$localtime[1]})) + { + $month = $mon2num{$localtime[1]}; + } + print OUTPUT13 "##fileformat=VCFv4.0\n"; + print OUTPUT13 "##fileDate=".$localtime[4].$month.$localtime[2]."\n"; + print OUTPUT13 "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; +} +if ($type eq "chloro") +{ + $chromosome = "cp"; +} +else +{ + $chromosome = "mt"; +} + +my $si = '0'; +my $space_at_end2; + +while (my $line = ) +{ + if ($si eq 0) + { + my $last_character = substr $line, -1; + if ($last_character =~ m/\s|\t/g) + { + $space_at_end2 = "yes"; + } + } + if ($si > 0) + { + chomp($line); + if ($space_at_end2 eq "yes") + { + chop($line); + } + my $seed_input_tmp = $seed_input; + $seed_input = $seed_input_tmp.$line; + } + $si++; +} + + +my %hashref; +my %hashref2; + +if ($reference ne "") +{ + select(STDERR); + $| = 1; + select(STDOUT); # default + $| = 1; + print "\nScan reference sequence..."; + open(INPUT5, $reference) or die "\n\nCan't open reference file $reference, $!\n"; + my $ff2 = '0'; + my $value_ref2 = ""; + + while (my $line = ) + { + if ($ff2 < 1) + { + $ff2++; + next; 
+ } + chomp $line; + $line =~ tr/actgn/ACTGN/; + my $first = substr $line, 0, 1; + my $line3; + if ($first eq '>' || $first eq '@') + { + $line3 = $value_ref2."NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN"; + } + else + { + $line3 = $value_ref2.$line; + } + + while (length($line3) > ((30*3)-1)) + { + my $value_ref2b = substr $line3, 0, 30; + my $line2 = $line3; + $line3 = substr $line2, 1; + + $hashref{$value_ref2b} .= exists $hashref{$value_ref2b} ? ",$ff2" : $ff2; + $hashref2{$ff2} .= exists $hashref2{$ff2} ? "$value_ref2b" : $value_ref2b; + + $ff2++; + } + $value_ref2 = $line3; + } + while (length($value_ref2) > 1) + { + my $value_ref2b = substr $value_ref2, 0, 30; + my $value_ref2bc = $value_ref2; + $value_ref2 = substr $value_ref2bc, 1; + + $hashref{$value_ref2b} .= exists $hashref{$value_ref2b} ? ",$ff2" : $ff2; + $hashref2{$ff2} .= exists $hashref2{$ff2} ? "$value_ref2b" : $value_ref2b; + $ff2++; + } + close INPUT5; + print "...OK\n"; + + my $seed_input_reverse = reverse($seed_input); + $seed_input_reverse =~ tr/ATCG/TAGC/; + my $d = '0'; + while ($d < length($seed_input)-30) + { + my $part = substr $seed_input, $d, 30; + my $part_reverse = substr $seed_input_reverse, $d, 30; + my $count = '0'; + my $count_reverse = '0'; + if (exists($hashref{$part})) + { + $count++; + } + if (exists($hashref{$part_reverse})) + { + $count_reverse++; + } + if ($count_reverse > $count) + { + $seed_input = $seed_input_reverse; + } + $d++; + } +} + +select(STDERR); +$| = 1; +select(STDOUT); # default +$| = 1; +print "\nBuilding Hash Table..."; + +my $file1; + +if ($reads12 eq "") +{ + $file1 = $reads1; +} +else +{ + $file1 = $reads12; +} + +my $file_count= '0'; +my $type_of_file2 = ""; +my $count_char = '0'; +my $count_hash_element = '0'; +my $out_of_memory; +my $memory_max_current; +my $all_reads = '0'; +my $count_lines = '0'; +my $count_paired = '0'; +my $skipped_reads = '0'; +my $chop = ""; +my $test = '0'; +my $keys_hash = '0'; +my %map_ids; +my $count_low_quality = 
'0'; + +READS_TMP: foreach my $reads_tmp (@reads_tmp) +{ + my $FILE; + if ($check_zip eq "gz") + { + open ($FILE, '-|', 'gzip', '-dc', $reads_tmp) or die "Can't open file $reads_tmp, $!\n"; + } + elsif ($check_zip2 eq "bz2") + { + open ($FILE, '-|', 'bzip2', '-dc', $reads_tmp) or die "Can't open file $reads_tmp, $!\n"; + } + else + { + open($FILE, $reads_tmp) or die "\n\nCan't open file $reads_tmp, $!\n"; + } + + my $N = '0'; + my $f = ""; + my $code = ""; + my $code_new = '1'; + my $value = ""; + my $quality = ""; + $out_of_memory = ""; + + while (my $line = <$FILE>) + { + chomp $line; + if ($file_count eq "00" && $out_of_memory eq "yes" && $reads12 eq "") + { + my $time = time; + $count_lines = `wc -l < $reads_tmp`; + print "\n".$count_lines." COUNT\n"; + print time-$time." TIME\n"; + $all_reads = '0'; + if ($no_quality_score ne "@" && $no_quality_score ne ">") + { + $all_reads += $count_lines*0.25; + } + else + { + $all_reads += $count_lines*0.5; + } + next READS_TMP; + } + elsif ($file_count eq "0" && $out_of_memory eq "yes" && $reads12 eq "") + { + if ($no_quality_score ne "@" && $no_quality_score ne ">") + { + $all_reads += 0.5; + } + else + { + $all_reads++; + } + next; + } + elsif ($file_count eq "11" && $count_paired eq keys %hash && $reads12 eq "") + { + if ($no_quality_score ne "@" && $no_quality_score ne ">") + { + $all_reads += $count_lines*0.25; + } + else + { + $all_reads += $count_lines*0.5; + } + last READS_TMP; + } + elsif ($file_count eq "1" && $count_paired eq $keys_hash && $reads12 eq "") + { + last READS_TMP; + } + if ($f eq "use_quality") + { + if ($use_quality ne "") + { + $value = $line; + } + $f = "yes"; + next; + } + if ($f eq "yes" && $no_quality_score ne "@" && $no_quality_score ne ">") + { + $f = "yes2"; + next; + } + if ($f eq "yes" && ($no_quality_score eq "@" || $no_quality_score eq ">")) + { + $code = $line; + $f = "no"; + next; + } + if ($f eq "yes2") + { + if ($use_quality ne "") + { + $f = "no"; + $quality = $line; + } + else + 
{ + $f = ""; + next; + } + } + if ($f eq "no") + { + $all_reads++; + if ($use_quality eq "") + { + $value = $line; + } + + if ($space_at_end eq "yes") + { + chop($value); + } + + my $code2 = $code; + my $code_end = substr $code2, $type_of_file, 1; + my $code0_SRA; + if ($paired eq 'SE') + { + $code_end = '2'; + } + + if ($SRA eq "yes") + { + my $code_SRA = substr $code2, 0, $type_of_file; + my @code_SRA = split / /, $code_SRA; + $code0_SRA = $code_SRA[0]; + $code_end = substr $code0_SRA, -1, 1, ""; + } + + if ($type_of_file eq "identical") + { + if ($file_count eq '0') + { + $code_end = "1"; + } + elsif ($file_count eq '1') + { + $code_end = "2"; + } + } + if ($type_of_file eq "yes") + { + if($code2 =~ m/.*(_|\s)(1|2)(:\w.*\d+:*(\s.*)*\s*\t*)$/) + { + $code_end = $2; + $type_of_file2 = -length($3)-1; + } + } + if ($type_of_file eq "split") + { + my @split = split / /, $code2; + $code_end = substr $split[1], 0, 1; + $type_of_file2 = $split[0]; + } + if ($code_end eq "1" && $out_of_memory ne "yes" && (length($line) > ($overlap+8)*2 || length($line) > $overlap+$right+$left+10)) + { + if ($encrypt eq "yes") + { + my $value_c = $value; + $value = encrypt $value_c; + } + if ($use_quality ne "") + { + my $quality2 = $quality; + my $check_quality = $quality2 =~ tr/\!|\"|\#|\$|\%|\&|\'|\(|\)|\*|\+|\,|\-|\.|\/|0|1|2|3|4|5/O/; + if ($check_quality > 0) + { + my $offset = '0'; + my $result = index($quality2, 'O', $offset); + + while ($result != -1) + { + my $nuc = substr $value, $offset, 1; + if ($nuc eq "A") + { + substr $value, $offset, 1, "1"; + } + elsif ($nuc eq "C") + { + substr $value, $offset, 1, "2"; + } + elsif ($nuc eq "T") + { + substr $value, $offset, 1, "3"; + } + elsif ($nuc eq "G") + { + substr $value, $offset, 1, "4"; + } + $offset = $result + 1; + $result = index($quality2, 'O', $offset); + } + } + } + my $code0 = substr $code2, 0, $type_of_file; + if ($SRA eq "yes") + { + $code0 = $code0_SRA; + } + + if ($type_of_file eq "identical") + { + $code0 = $code2; + 
} + if ($type_of_file eq "yes") + { + $code0 = substr $code2, 0, $type_of_file2; + } + if ($type_of_file eq "split") + { + $code0 = $type_of_file2; + } + if ($memory_max_current > $max_memory && $max_memory ne "") + { + $out_of_memory = "yes"; + } + $keys_hash++; + $hash{$code0."1"} = $value; + + $count_char += length($value)*2; + $count_char += $overlap*2; + $count_hash_element++; + $memory_max_current = ((($count_hash_element/4805)*4.5) + ($count_char/1312123))/1000; + if ($save_reads eq "2") + { + $count_char += length($code0); + $memory_max_current = ((($count_hash_element/4805)*5.5) + ($count_char/1312123))/1000; + } + + if ($chop eq "") + { + my $last_character = substr $code0, -1; + if ($last_character =~ m/\s|\t/g) + { + $chop = "yes"; + } + else + { + $chop = "no"; + } + } + } + elsif ($code_end eq "1") + { + $skipped_reads++; + } + elsif ($code_end eq "2") + { + my $code1 = substr $code2, 0, $type_of_file; + if ($SRA eq "yes") + { + $code1 = $code0_SRA; + } + if ($paired eq "SE") + { + $code1 = $code2; + } + if ($type_of_file eq "identical") + { + $code1 = $code2; + } + if ($type_of_file eq "yes") + { + $code1 = substr $code2, 0, $type_of_file2; + } + if ($type_of_file eq "split") + { + $code1 = $type_of_file2; + } + + if ((exists($hash{$code1."1"}) || $paired eq "SE") && (length($line) > ($overlap+8)*2 || length($line) > $overlap+$right+$left+10)) + { + $count_paired++; + my $value2 = $hash{$code1."1"}; + delete $hash{$code1."1"}; + my $code_new1 = ($code_new*10)+1; + my $code_new2 = ($code_new*10)+2; + my $value3; + my $value4; + + if ($encrypt eq "yes") + { + $value3 = encrypt $value2; + $value4 = encrypt $value; + } + else + { + $value3 = $value2; + $value4 = $value; + } + if ($use_quality ne "") + { + $value2 =~ tr/1234/ACTG/; + my $quality2 = $quality; + my $check_quality = $quality2 =~ tr/\!|\"|\#|\$|\%|\&|\'|\(|\)|\*|\+|\,|\-|\.|\/|0|1|2|3|4|5/O/; + if ($check_quality > 0) + { + my $offset = '0'; + my $result = index($quality2, 'O', $offset); + + 
while ($result != -1)
+                            {
+                                # FIX: $result is the position of the flagged quality character,
+                                # so that is the base of mate 2 to mask (A/C/T/G -> 1/2/3/4).
+                                # The code used $offset here, which is 0 on the first pass and
+                                # "previous hit + 1" afterwards, so it masked the wrong bases.
+                                my $nuc = substr $value4, $result, 1;
+                                if ($nuc eq "A")
+                                {
+                                    substr $value4, $result, 1, "1";
+                                }
+                                elsif ($nuc eq "C")
+                                {
+                                    substr $value4, $result, 1, "2";
+                                }
+                                elsif ($nuc eq "T")
+                                {
+                                    substr $value4, $result, 1, "3";
+                                }
+                                elsif ($nuc eq "G")
+                                {
+                                    substr $value4, $result, 1, "4";
+                                }
+                                # Resume the search just past the position handled above.
+                                $offset = $result + 1;
+                                $result = index($quality2, 'O', $offset);
+                            }
+                        }
+                    }
+                    if ($save_reads eq "2")
+                    {
+                        if ($chop eq "yes")
+                        {
+                            chop($code1);
+                        }
+                        $map_ids{$code_new} = $code1;
+                    }
+
+                    # Store the pair under the numeric id $code_new.
+                    # NOTE(review): the seed lookup further down reads %hash2b/%hash2c values
+                    # with "substr ..., 1", i.e. it drops their first character, so the exact
+                    # form of these ".= exists ... ? ... : ..." appends must not be altered
+                    # without checking every reader.
+                    if ($paired eq "PE")
+                    {
+                        $hash{$code_new} = $value3;
+                    }
+                    $hash{$code_new} .= exists $hash{$code_new} ? ",$value4" : $value4;
+
+                    # Index two $overlap-long words of mate 2: one starting $left characters
+                    # in (%hash2b) and one ending $right characters before the end (%hash2c).
+                    my $first = substr $value, $left, $overlap;
+                    my $second = substr $value, -($overlap+$right), $overlap;
+
+                    $hash2b{$first} .= exists $hash2b{$first} ? ",$code_new2" : $code_new2;
+                    $hash2c{$second} .= exists $hash2c{$second} ? ",$code_new2" : $code_new2;
+
+                    if ($paired eq "PE")
+                    {
+                        # Same two words for mate 1 ($value2 is the read parked earlier).
+                        $first = substr $value2, $left, $overlap;
+                        $second = substr $value2, -($overlap+$right), $overlap;
+
+                        $hash2b{$first} .= exists $hash2b{$first} ? ",$code_new1" : $code_new1;
+                        $hash2c{$second} .= exists $hash2c{$second} ? ",$code_new1" : $code_new1;
+                    }
+                    $code_new++;
+                }
+                elsif ($code_end eq "2" && (exists($hash{$code1."1"}) || $paired eq "SE"))
+                {
+                    # Mate 2 is too short: drop the parked mate 1 as well.
+                    delete $hash{$code1."1"};
+                    $count_paired++;
+                    $skipped_reads++;
+                }
+                else
+                {
+                    $skipped_reads++;
+                }
+            }
+            else
+            {
+                $test++;
+                print $code_end." LINE\n";
+            }
+            $f = "yes";
+            if ($use_quality ne "")
+            {
+                $f = "";
+            }
+        }
+        else
+        {
+            # Header line: remember it and flag that the next line must be processed.
+            $code = $line;
+            $f = "no";
+            if ($use_quality ne "")
+            {
+                $f = "use_quality";
+            }
+        }
+    }
+    $file_count++;
+    close $FILE;
+}
+print "...OK\n\n";
+
+my $size2 = keys %hash;
+# FIX: when every counted read was skipped (or no read was counted at all) the
+# denominator is 0 and Perl dies with "Illegal division by zero"; report 0 % instead.
+my $percentage_usedb = ($all_reads-$skipped_reads) != 0 ? $size2*100*2/($all_reads-$skipped_reads) : 0;
+my $percentage_used = sprintf("%.2f", $percentage_usedb);
+print "Subsampled fraction: ".$percentage_used." %\n";
+print OUTPUT4 "Subsampled fraction: ".$percentage_used." 
%\n"; +select(STDERR); +$| = 1; +select(STDOUT); # default +$| = 1; +print "\nRetrieve Seed..."; + + +HP_BACK: + +my %read1; + +FIRST_SEED: + +my $seed_input_new2; +my $seed_input_tmp = $seed_input; +my $low_coverage_check; + +if ($bad_read ne "" && keys %contigs ) +{ + $seed_input_tmp = $read; +} +if ($seed_input_tmp ne "") +{ + my $build = ""; +REF0: + my $n = '0'; + my $overlap_tmp = $overlap; + if ($build eq "yes2" && $overlap > 42) + { + $overlap_tmp = '33'; + } + $low_coverage_check = '0'; + + while ($n < length($seed_input_tmp) - $overlap_tmp) + { + my $first_seed = substr $seed_input_tmp, $n, $overlap_tmp; + my %first_seed; + undef %first_seed; + if ($build eq "yes") + { + %first_seed = build_partial2b $first_seed; + } + else + { + $first_seed{$first_seed} = undef; + } + if ($build eq "yes2" && $overlap > '42' && $n < 3000) + { + my @match1 = grep {$_ =~ /.*($first_seed).+$/} keys %hash2b; + my @match2 = grep {$_ =~ /.*($first_seed).+$/} keys %hash2c; + my @match3 = (@match1,@match2); + undef %first_seed; + foreach $first_seed (@match3) + { + $first_seed{$first_seed} = undef; + } + } +FIRST_SEED2:foreach my $first_seed (keys %first_seed) + { + if (exists($hash2b{$first_seed})) + { + my $seed_input_id2 = substr $hash2b{$first_seed}, 1; + my @seed_input_id = split /,/, $seed_input_id2; + my $seed_input_id = $seed_input_id[0]; + my $seed_input_id_tmp = substr $seed_input_id, 0, -1; + my $seed_input_id_end = substr $seed_input_id, -1; + if (exists ($bad_read{$seed_input_id_tmp})) + { + next FIRST_SEED2; + } + if (exists($hash{$seed_input_id_tmp})) + { + $low_coverage_check++; + my @seed_input_id_tmp = split /,/,$hash{$seed_input_id_tmp}; + my $seed_input_new; + if ($seed_input_id_end eq "1") + { + $seed_input_new = $seed_input_id_tmp[0]; + } + elsif ($seed_input_id_end eq "2") + { + $seed_input_new = $seed_input_id_tmp[1]; + } + if ($use_quality ne "") + { + $seed_input_new =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $seed_input_new2 = decrypt 
$seed_input_new; + } + else + { + $seed_input_new2 = $seed_input_new; + } + if (exists($read1{$seed_input_new2})) + { + next FIRST_SEED2; + } + my $pp = '0'; + my $pp2= '0'; + my $part2 = substr $seed_input_new2, -$overlap-25; + + while ($pp < length($part2)-$overlap) + { + my $seed_check = substr $part2, $pp, $overlap; + my $seed_check_reverse = reverse($seed_check); + $seed_check_reverse =~ tr/ACTG/TGAC/; + + if (exists($hash2b{$seed_check})) + { + $pp2++; + } + if (exists($hash2c{$seed_check})) + { + $pp2++; + } + if (exists($hash2b{$seed_check_reverse})) + { + $pp2++; + } + if (exists($hash2c{$seed_check_reverse})) + { + $pp2++; + } + if ($pp2 > 5) + { + $bad_read{$seed_input_id_tmp} = "yes"; + my %empty_hash; + my $first_read = correct ($seed_input_new2 , \%empty_hash, $heteroplasmy); + $seed{$seed_input_id} = $first_read; + $seeds_check{$seed_input_id} = undef; + $read1{$first_read} =undef; + $contig_count{$seed_input_id} = '0'; + $position{$seed_input_id} = length($seed{$seed_input_id}); + + print "...OK\n"; + print "\nInitial read retrieved successfully: ".$first_read."\n"; + print OUTPUT4 "\nInitial read retrieved successfully: ".$first_read."\n"; + print OUTPUT5 "\nInitial read retrieved successfully: ".$first_read."\n"; + if ($bad_read ne "" && keys %contigs) + { + $noback{$seed_input_id} = "stop"; + $tree{$id} = $seed_input_id; + } + else + { + $tree{"START"} = $seed_input_id; + $first_contig_id = $seed_input_id; + } + if (exists($old_id{$id})) + { + $old_id{$old_id{$id}} = $seed_input_id; + } + goto REF2; + } + $pp++; + } + } + } + if (exists($hash2c{$first_seed})) + { + $low_coverage_check++; + my $seed_input_id2 = substr $hash2c{$first_seed}, 1; + my @seed_input_id = split /,/, $seed_input_id2; + my $seed_input_id = $seed_input_id[0]; + my $seed_input_id_tmp = substr $seed_input_id, 0, -1; + my $seed_input_id_end = substr $seed_input_id, -1; + if (exists ($bad_read{$seed_input_id_tmp})) + { + next FIRST_SEED2; + } + if 
(exists($hash{$seed_input_id_tmp})) + { + my @seed_input_id_tmp = split /,/,$hash{$seed_input_id_tmp}; + my $seed_input_new; + if ($seed_input_id_end eq "1") + { + $seed_input_new = $seed_input_id_tmp[0]; + } + elsif ($seed_input_id_end eq "2") + { + $seed_input_new = $seed_input_id_tmp[1]; + } + if ($use_quality ne "") + { + $seed_input_new =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $seed_input_new2 = decrypt $seed_input_new; + } + else + { + $seed_input_new2 = $seed_input_new; + } + if (exists($read1{$seed_input_new2})) + { + next FIRST_SEED2; + } + my $pp = '0'; + my $pp2= '0'; + my $part2 = substr $seed_input_new2, -$overlap-25; + + while ($pp < length($part2)-$overlap) + { + my $seed_check = substr $part2, $pp, $overlap; + my $seed_check_reverse = reverse($seed_check); + $seed_check_reverse =~ tr/ACTG/TGAC/; + + if (exists($hash2b{$seed_check})) + { + $pp2++; + } + if (exists($hash2c{$seed_check})) + { + $pp2++; + } + if (exists($hash2b{$seed_check_reverse})) + { + $pp2++; + } + if (exists($hash2c{$seed_check_reverse})) + { + $pp2++; + } + if ($pp2 > 5) + { + $bad_read{$seed_input_id_tmp} = "yes"; + my %empty_hash; + my $first_read = correct ($seed_input_new2 , \%empty_hash, $heteroplasmy); + $seed{$seed_input_id} = $first_read; + $seeds_check{$seed_input_id} = undef; + $read1{$first_read} =undef; + $contig_count{$seed_input_id} = '0'; + $position{$seed_input_id} = length($seed{$seed_input_id}); + + print "...OK\n"; + print "\nInitial read retrieved successfully: ".$first_read."\n"; + print OUTPUT4 "\nInitial read retrieved successfully: ".$first_read."\n"; + print OUTPUT5 "\nInitial read retrieved successfully: ".$first_read."\n"; + if ($bad_read ne "" && keys %contigs) + { + $noback{$seed_input_id} = "stop"; + $tree{$id} = $seed_input_id; + } + else + { + $tree{"START"} = $seed_input_id; + $first_contig_id = $seed_input_id; + } + if (exists($old_id{$id})) + { + $old_id{$old_id{$id}} = $seed_input_id; + } + goto REF2; + } + $pp++; + } + } + } + } + 
$n++; + if ($build eq "yes2") + { + $n += 4; + } + } + if ($low_coverage_check > 10 && $bad_read ne "yes2") + { + print "\n\nCOVERAGE IS TOO LOW, SHOULD BE MORE THAN 10X\n\n"; + print OUTPUT4 "\n\nCOVERAGE IS TOO LOW, SHOULD BE MORE THAN 10X\n\n"; + exit; + } + if ($build eq "" && $bad_read ne "yes2") + { + $build = "yes"; + goto REF0; + } + elsif ($build eq "yes") + { + $build = "yes2"; + goto REF0; + } + elsif ($hp_back ne "") + { + goto HP0; + } + else + { + print "\n\nINVALID SEED, PLEASE TRY AGAIN WITH A NEW ONE\n\n"; + print OUTPUT4 "\n\nINVALID SEED, PLEASE TRY AGAIN WITH A NEW ONE\n\n"; + exit; + } +} +else +{ +REF:foreach my $line (keys %hash) + { + my $count = '0'; + my $n = '5'; + my @line_read = split /,/, $hash{$line}; + my $line_read = $line_read[0]; + + while ($n < length($line_read) - $overlap) + { + my $line_part = substr $line_read, $n, $overlap; + if (exists($hash2b{$line_part})) + { + my @results = split /,/, $hash2b{$line_part}; + foreach (@results) + { + $count++; + } + } + if (exists($hash2c{$line_part})) + { + my @results = split /,/, $hash2c{$line_part}; + foreach (@results) + { + $count++; + } + } + if ($count > 10) + { + my $line_read2 = $line_read[1]; + my $count2 = '0'; + my $n2 = '5'; + + while ($n2 < length($line_read2) - $overlap) + { + my $line_part2 = substr $line_read2, $n2, $overlap; + if (exists($hash2b{$line_part2})) + { + my @results = split /,/, $hash2b{$line_part2}; + foreach (@results) + { + $count2++; + } + } + if (exists($hash2c{$line_part2})) + { + my @results = split /,/, $hash2c{$line_part2}; + foreach (@results) + { + $count2++; + } + } + if ($count2 > 10) + { + $seed{$line."1"} = $line_read; + $seeds_check{$line."1"} = undef; + print "...OK\n"; + print "\nInitial read retrieved successfully: ".$line_read."\n"; + print OUTPUT4 "\nInitial read retrieved successfully: ".$line_read."\n"; + last REF; + } + $n2++; + } + } + $n++; + } + } +} +close INPUT3; +REF2: +my $seed = ""; +my $seed_id = ""; +my $read_short_end = ""; 
+my %read_short_end; +my @read_short_end; +my $read_short_end2 = ""; +my %read_short_end2; +my $read_short_zone = ""; +my $read_short_start = ""; +my %read_short_start; +my @read_short_start; +my $read_short_start2 = ""; +my %read_short_start2; +my $read_end = ""; +my @read_end; +my @read_end_chars; +my $read_end_b = ""; +my @read_end_b; +my %read_end; +my %read_end_b; +my %read_start; +my %read_start_b; +my $read_start = ""; +my $read_start_b = ""; +my @read_start; +my @read_start_b; +my @read_start_chars; +my $read_short_zone_start = ""; +my $match_pair = ""; +my $match_pair2 = ""; +my $extension = ""; +my $extension_match = ""; +my $match = ""; +my $pair = ""; +my $regex = ""; +my @match_pair; +my %merged_match; +my %merged_match1; +my %merged_match2; +my %merged_match_back; +my %merged_match_back1; +my %merged_match_back2; +my %merged_match_pos; +my %merged_match_back_pos; +my @matches; +my @matches1; +my @matches2; +my $read_new = ""; +my $id_test = ""; +my $best_extension = ""; +my $best_match2 = ""; +my $best_extension1 = ""; +my $best_extension2 = ""; +my $best_extension3 = ""; +my $best_extension4 = ""; +my $use_regex = ""; +my $use_regex_back = ""; +my %extensions; +my %extensions_original; +my %extensions1; +my %extensions2; +my %extensions1b; +my %extensions2b; +my %extensionsb; +my @extensions; +my @extensions1; +my @extensions2; +my %extensions_for_before; +my %extensions_for_before2; +my $new_best = ""; +my $extra_seed = ""; +my $position = length($seed_input_new2); +my %insert_size2; +my $position_back = '0'; +my $noback = ""; +my $noforward = ""; +my $split = ""; +my $best_extension_split; +my $merge = ""; +my $last_chance = ""; +my %last_chance; +my $last_chance_back = ""; +my %last_chance_back; +my $circle; +my $containX_short_end2 = ""; +my $contain_dot_short_end2 = ""; +my $containX_short_start2 = ""; +my $contain_dot_short_start2 = ""; +my $read_test = ""; +my $AT_rich; +my $id_old; +my $read_new1; +my $delete_first; +my $delete_second; +my 
$delete_third; +my $correct_after_split; +my $sc = '0'; +my $super_best_extension; +my $still_first_seed = "yes"; +my $ext_before; +my %rep_return; +my %rep_return_back; + +foreach (keys %seed) +{ + $sc++; +} +if ($bad_read eq "" || $sc eq '1') +{ + foreach $seed_id (keys %seed) + { + } + if ($y > $startprint2 && $hp_seed_assemble eq "") + { + print "\nStart Assembly...\n\n"; + print OUTPUT4 "\nStart Assembly...\n\n"; + } +} +ITERATION: while ($y < $iterations) +{ + if ($y > $startprint2) + { + print OUTPUT5 "\n".$y."\n\n"; + } + if (!%seed) + { + last ITERATION; + } + my $length_other_contig = '0'; + foreach my $contig_tmp (keys %contigs) + { + $length_other_contig += length($contigs{$contig_tmp}); + } + + $|=1; + my $progress = length($read)+$length_other_contig." bp assembled"; + + print "\b" x length($progress_before); + print ' ' x length($progress_before); + print "\b" x length($progress_before); + $progress_before = $progress; + print $progress; + + if ($still_first_seed ne "yes") + { + $still_first_seed = "yes2"; + } + else + { + $still_first_seed = ""; + } + +SEED: foreach $seed_id (keys %seed) +{ + if ($still_first_seed ne "yes2" && $hp_seed_assemble eq "") + { + my $test_first_seed; + if ($seed_id =~ m/.*_(\d+)$/) + { + $test_first_seed = $1; + } + else + { + $test_first_seed = $seed_id; + } + if ($test_first_seed ne $first_contig_id) + { + next SEED; + } + $still_first_seed = "yes"; + } + if ($benchmark_time eq "yes") + { + $time_start_seed = time; + } + $merge = ""; + $split = ""; + $circle = ""; + $AT_rich = ""; + + undef %extensions; + undef %extensions_original; + undef %extensions1; + undef %extensions2; + undef %extensions1b; + undef %extensions2b; + undef %extensions_for_before; + undef %extensions_for_before2; + undef @extensions; + undef @extensions1; + undef @extensions2; + undef @matches; + undef @matches1; + undef @matches2; + undef %merged_match; + undef %merged_match1; + undef %merged_match2; + undef %merged_match_pos; + undef 
%merged_match_back; + undef %merged_match_back1; + undef %merged_match_back2; + undef %merged_match_back_pos; + undef @read_end; + undef @read_end_b; + undef %read_end; + undef %read_end_b; + undef %read_start; + undef %read_start_b; + undef %read_end_tmp; + undef %read_end_b_tmp; + undef %read_short_end_tmp; + undef %read_short_zone_tmp; + undef %read_short_start_tmp; + undef %read_short_zone_start_tmp; + undef %read_start_tmp; + undef %read_start_b_tmp; + undef %SNR_length; + undef %match_rep; + undef %count_rep; + undef @read_start; + undef @read_start_b; + undef @read_end_chars; + undef @read_short_end; + undef @read_start_chars; + undef @read_short_start; + undef %filter_before1; + undef %filter_before2; + undef %filter_before3; + undef %filter_before4; + undef %filter_before1_pair; + undef %filter_before2_pair; + undef %filter_before3_pair; + undef %filter_before4_pair; + undef %allele_percentage; + undef %allele_percentage_back; + undef %allele_total; + undef %allele_total_back; + undef %A; + undef %C; + undef %T; + undef %G; + undef %accepted_SNPs; + undef %accepted_SNPs_back; + undef %accepted_SNPs_pair; + undef %accepted_SNPs_pair_back; + undef %remove_extension_mismatch; + $containX_short_end2 = '0'; + $contain_dot_short_end2 = '0'; + $containX_short_start2 = '0'; + $contain_dot_short_start2 = '0'; + $delete_first = ""; + $delete_second = ""; + $delete_third = ""; + my $merge_read = ""; + my $merge_read_pair = ""; + my $merge_read_length = '0'; + $SNR_read = ""; + $SNR_read2 = ""; + $SNR_read_back = ""; + $SNR_read_back2 = ""; + $split_forward = ""; + $deletion = ""; + $deletion_back = ""; + $reference_guided = ""; + $reference_guided_back = ""; + $repetitive_detect = ""; + $repetitive_detect_back = ""; + $repetitive_detect2 = ""; + $repetitive_detect_back2 = ""; + $contig_end = ""; + $contig_end_check = ""; + $repetitive = ""; + $before_repetitive = ""; + $before_repetitive_short = ""; + $CP_check = ""; + $before_extension1 = ""; + $before_extension2 = 
""; + $before_extension_back1 = ""; + $before_extension_back2 = ""; + $id_split1 = ""; + $id_split2 = ""; + $id_split3 = ""; + $super_best_extension = ""; + $no_contig_id2 = ""; + $no_contig_id1 = ""; + $no_contig_id3 = ""; + $no_contig_id4 = ""; + $rep_detect2 = ""; + $no_next_seed = ""; + $count_split = ""; + $yuyu_option_A = ""; + $yuyu_option_C = ""; + $yuyu_option_T = ""; + $yuyu_option_G = ""; + $yuyu_option_A_back = ""; + $yuyu_option_C_back = ""; + $yuyu_option_T_back = ""; + $yuyu_option_G_back = ""; + $extensions_before = ""; + $SNR_next_seed = ""; + $ext_before = ""; + $AT_rich_before = ""; + $AT_rich_before_back = ""; + $SNR_nucleo = ""; + $SNR_nucleo_back = ""; + $insert_range_shorter = ""; + $merge_now = ""; + $use_regex = ""; + $use_regex_back = ""; + $best_extension_old1 = ""; + $best_extension_old2 = ""; + $best_extension_old3 = ""; + $best_extension_old4 = ""; + $reference_next_seed = ""; + $next_seed_ref = ""; + $ref_skip_before = ""; + $ref_skip_before_back = ""; + $best_extension_forward = ""; + $noforward_HP = ""; + $noback_HP = ""; + foreach my $SNP_tmp (keys %SNPs) + { + print OUTPUT5 $SNP_tmp." SNP_test\n"; + } + if (exists($indel_split{$seed_id})) + { + $indel_split = $indel_split{$seed_id}; + $insert_range = $insert_range_c; + if ($y > $startprint2) + { + print OUTPUT5 "\n".$indel_split." INDEL_SPLIT\n"; + } + } + else + { + $indel_split = '0'; + $insert_range = $insert_range_b; + } + if (exists($indel_split_back{$seed_id})) + { + $indel_split_back = $indel_split_back{$seed_id}; + $insert_range_back = $insert_range_c; + if ($y > $startprint2) + { + print OUTPUT5 "\n".$indel_split_back." INDEL_SPLIT_BACK\n"; + } + } + else + { + $indel_split_back = '0'; + $insert_range_back = $insert_range_b; + } + + if (exists($insert_size2{$seed_id})) + { + $insert_size = $insert_size2{$seed_id}; + } + + if (exists($seed{$seed_id})) + { + $seed = $seed{$seed_id}; + + if ($y > $startprint2) + { + print OUTPUT5 "\n".$seed_id." 
SEED_exists\n\n"; + print OUTPUT5 length($seed)." READ_LENGTH\n"; + } + + $id = $seed_id; + $read = $seed; + + + if (exists($noforward{$id}) || $hp_back eq "yes") + { + $noforward = "stop"; + if ($noforward{$id} eq "stop_HP") + { + $noforward_HP = "yes"; + } + } + else + { + $noforward = ""; + } + my $check_old_id = ""; + if (exists($old_id{$id})) + { + $check_old_id = "yes"; + } + if (exists($noback{$id}) || (length($read) < $insert_size && $noforward eq "" && $check_old_id eq "" && $hp_seed_assemble ne "yes2" && $hp_back ne "yes")) + { + $noback = "stop"; + if ($noback{$id} eq "stop_HP") + { + $noback_HP = "yes"; + } + } + else + { + $noback = ""; + } + if ($platform eq "ion") + { + my $SNR_end_ion = substr $read, -4; + my $SNR_check_ion = $SNR_end_ion =~ s/AAAA|CCCC|GGGG|TTTT//; + if ($SNR_check_ion > 0) + { + $SNR_read = "yes"; + $SNR_nucleo = substr $read, -1, 1; + my $SNR_end_ion2 = substr $read, -6; + my $SNR_check_ion2 = $SNR_end_ion2 =~ s/AAAAAA|CCCCCC|GGGGGG|TTTTTT//; + if ($SNR_check_ion2 > 0) + { + $read .= "*"; + } + } + my $SNR_end_ion_back = substr $read, 0, 4; + my $SNR_check_ion_back = $SNR_end_ion_back =~ s/AAAA|CCCC|GGGG|TTTT//; + if ($SNR_check_ion_back > 0) + { + $SNR_read_back = "yes"; + $SNR_nucleo_back = substr $read, 0, 1; + } + } + if ($noforward eq "") + { + my $SNR_end0 = substr $read, -20, 20; + my $SNR_end0t = substr $read, -25, 25; + $SNR_end0 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $SNR_end0t =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my @SNR_end0 = split //, $SNR_end0; + my $u0 = length($SNR_end0); + my $v0 = '1'; + my $other = ""; + my $Spn0 = '-1'; + my $count_dot = '0'; + +ALREADY_X0: while ($v0 < $u0) + { + if ($SNR_end0[$u0-$v0-1] eq $SNR_end0[$u0-$v0] || (($SNR_end0[$u0-$v0-1] eq "." || $SNR_end0[$u0-$v0] eq "." 
) && $count_dot < 2)) + { + $v0++; + if(($v0 > 6 && $other eq "") || $v0 > 9) + { + $SNR_read = "yes"; + $SNR_nucleo = substr $read, $Spn0, 1; + if ($SNR_end0[$u0-1] eq $SNR_nucleo || $SNR_end0[$u0-2] eq $SNR_nucleo || $SNR_end0[$u0-3] eq $SNR_nucleo) + { + $SNR_read2 = "yes"; + } + last ALREADY_X0; + } + if ($SNR_end0[$u0-$v0] eq ".") + { + $count_dot++; + } + } + elsif($SNR_end0[$u0-$v0-1] eq "X") + { + if ($y > $startprint2) + { + print OUTPUT5 "ALREADY_X\n"; + } + $SNR_read = "X"; + last ALREADY_X0; + } + elsif(($v0 > 6 && $other eq "") || $v0 > 9) + { + $SNR_read = "yes"; + if (length($best_extension_prev{$id}) > 1) + { + $SNR_nucleo = substr $read, $Spn0, 1; + } + else + { + $SNR_nucleo = substr $read, $Spn0, 1; + } + if ($SNR_end0[$u0-1] eq $SNR_nucleo) + { + $SNR_read2 = "yes"; + } + if ($y > $startprint2) + { + print OUTPUT5 $SNR_end0[$u0-1]." ALREADY_X\n"; + } + last ALREADY_X0; + } + elsif($other eq "") + { + $other = "yes"; + if ($v0 eq '1') + { + $Spn0 = '-3'; + } + $v0++; + } + else + { + if ($platform ne "ion") + { + $SNR_read = ""; + } + last ALREADY_X0; + } + } + if ($SNR_read eq "") + { + my $SNR_check = $SNR_end0t =~ s/AAAAAAAA|CCCCCCCC|GGGGGGGG|TTTTTTTT|TATATATATATATATA//; + if ($SNR_check > 0) + { + $SNR_read = "yes"; + } + } + + foreach my $pos (keys %SNR_regions_hp) + { + if (exists($last_pos_seq_forward{$id})) + { + print OUTPUT5 $last_pos_seq_forward{$id}." 
DDD\n"; + if ($pos < ($last_pos_seq_forward{$id}+15) && $pos > ($last_pos_seq_forward{$id}-15)) + { + $SNR_read = "yes"; + } + } + } + } + if ($noback eq "") + { + my $SNR_end0b = substr $read, 0, 20; + my $SNR_end0bt = substr $read, 0, 25; + $SNR_end0b =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $SNR_end0bt =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my @SNR_end0b = split //, $SNR_end0b; + my $u0b = length($SNR_end0b); + my $v0b = '1'; + my $otherb = ""; + my $Spn0b = '0'; + my $count_dotb = '0'; + +ALREADY_X0b:while ($v0b < $u0b) + { + if ($SNR_end0b[$v0b-1] eq $SNR_end0b[$v0b] || (($SNR_end0b[$v0b] eq "." || $SNR_end0b[$v0b-1] eq ".") && $count_dotb < 2)) + { + $v0b++; + if(($v0b > 6 && $otherb eq "") || $v0b > 9) + { + $SNR_read_back = "yes"; + $SNR_nucleo_back = substr $read, $Spn0b, 1; + if ($SNR_end0b[0] eq $SNR_nucleo_back) + { + $SNR_read_back2 = "yes"; + } + last ALREADY_X0b; + } + if ($SNR_end0b[$v0b-1] eq ".") + { + $count_dotb++; + } + } + elsif($SNR_end0b[$v0b] eq "X") + { + if ($y > $startprint2) + { + print OUTPUT5 "ALREADY_Xb\n"; + } + $SNR_read_back = "X"; + last ALREADY_X0b; + } + elsif(($v0b > 6 && $otherb eq "") || $v0b > 9) + { + $SNR_read_back = "yes"; + if (length($best_extension_back_prev{$id}) > 1) + { + $SNR_nucleo_back = substr $read, $Spn0b, 1; + } + else + { + $SNR_nucleo_back = substr $read, $Spn0b, 1; + } + if ($SNR_end0b[0] eq $SNR_nucleo_back) + { + $SNR_read_back2 = "yes"; + } + last ALREADY_X0b; + } + elsif($otherb eq "") + { + $otherb = "yes"; + if ($v0b eq '1') + { + $Spn0b = '2'; + } + $v0b++; + } + else + { + if ($platform ne "ion") + { + $SNR_read_back = ""; + } + last ALREADY_X0b; + } + } + if ($SNR_read_back eq "") + { + my $SNR_check = $SNR_end0bt =~ s/AAAAAAAA|CCCCCCCC|GGGGGGGG|TTTTTTTT|TATATATATATATATA//; + if ($SNR_check > 0) + { + $SNR_read_back = "yes"; + } + } + + foreach my $pos (keys %SNR_regions_hp) + { + if (exists($last_pos_seq_back{$id})) + { + if ($pos < $last_pos_seq_back{$id}+15 && $pos > $last_pos_seq_back{$id}-15) + { + 
$SNR_read_back = "yes"; + } + } + } + } + + $contig_count = $contig_count{$id}; + + if (exists($position{$id})) + { + $position = $position{$id}; + } + if (exists($position_back{$id})) + { + $position_back = $position_back{$id}; + } + + $read_short_end2 = substr $read, -$read_length-200; + $read_short_start2 = substr $read, 0, $read_length+200; + $read_short_end2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $read_short_start2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $containX_short_end2 = $read_short_end2 =~ tr/X|\*//; + $contain_dot_short_end2 = $read_short_end2 =~ tr/\.//; + $containX_short_start2 = $read_short_start2 =~ tr/X|\*//; + $contain_dot_short_start2 = $read_short_start2 =~ tr/\.//; + + if ($paired eq "PE") + { + $read_short_end = substr $read, -($insert_size*$insert_range)+(($read_length-$overlap-8)/2), ((($insert_size*$insert_range)-$insert_size)*2)+$overlap+10+8; + $read_short_start = substr $read, ($insert_size*$insert_range_back)-(($read_length-$overlap-8)/2) - (((($insert_size*$insert_range_back)-$insert_size)*2)+$overlap+10+8), ((($insert_size*$insert_range_back)-$insert_size)*2)+$overlap+10+8; + $read_short_end =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $read_short_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + } + + if ($y eq '1' || exists($old_id2{$id}) || exists($bad_read{$id})) + { + delete $bad_read{$id}; + + $position{$id} = length($read); + $position = length($read); + $position_back = '0'; + $position_back{$id} = '0'; + delete $old_id2{$id}; + $read =~ s/\s+$//; + $read =~ s/\t+$//; + } + if ($y eq '2') + { + my $start_point = '25'; + $first_contig_start_reverse = substr $read, $start_point, $overlap; + + my $check_start = $first_contig_start_reverse =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//; + while ($check_start > 0) + { + $start_point += 5; + $first_contig_start_reverse = substr $read, $start_point, $overlap; + $check_start = $first_contig_start_reverse =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//; + } + $first_contig_start_reverse = reverse($first_contig_start_reverse); + 
$first_contig_start_reverse =~ tr/ATCG/TAGC/;
+        }
+
+        if ($y > $startprint2)
+        {
+            print OUTPUT5 $insert_size." INSERT_SIZE\n\n";
+            print OUTPUT5 $position." POSITION\n";
+            print OUTPUT5 $position_back." POSITION_BACK\n";
+        }
+        # Switch to regex matching ("yes2") when the tail of the read still holds a '.'
+        # (ambiguity codes were rewritten to '.' when $read_short_end2 was built).
+        if ($use_regex eq "")
+        {
+            my $read_end_dot = substr $read_short_end2, -($read_length-($overlap+$left+1-$containX_short_end2-$containX_short_end2));
+            my $read_end_dot_check = '0';
+            $read_end_dot_check = $read_end_dot =~ tr/\./\./;
+            if ($read_end_dot_check > 0)
+            {
+                $use_regex = "yes2";
+            }
+        }
+        # Same test for the head of the read.
+        # FIX: the window is cut from $read_short_start2, so it is now sized with the
+        # X/* count of that region ($containX_short_start2). The code used
+        # $containX_short_end2 here, which looks like a copy-paste from the forward check.
+        if ($use_regex_back eq "")
+        {
+            my $read_start_dot = substr $read_short_start2, 0, ($read_length-($overlap+$left+1-$containX_short_start2-$containX_short_start2));
+            my $read_start_dot_check = '0';
+            $read_start_dot_check = $read_start_dot =~ tr/\./\./;
+            if ($read_start_dot_check > 0)
+            {
+                $use_regex_back = "yes2";
+            }
+        }
+
+        # Debug log (OUTPUT5) of the SNR flags; printed only once $y has passed $startprint2.
+        if ($SNR_read ne "")
+        {
+            if ($y > $startprint2)
+            {
+                print OUTPUT5 $SNR_nucleo." SNR_READ\n";
+            }
+        }
+        if ($SNR_read2 ne "")
+        {
+            if ($y > $startprint2)
+            {
+                print OUTPUT5 "SNR_READ2\n";
+            }
+        }
+        if ($SNR_read_back ne "")
+        {
+            if ($y > $startprint2)
+            {
+                print OUTPUT5 "SNR_BACK_READ\n";
+            }
+        }
+        # Load the per-seed state stored under $id into the working scalars. A stored
+        # %regex / %regex_back entry wins; otherwise only a freshly detected "yes2" survives.
+        if (exists($regex{$id}))
+        {
+            $use_regex = $regex{$id};
+        }
+        elsif ($use_regex ne "yes2")
+        {
+            $use_regex = "";
+        }
+        if (exists($before{$id}))
+        {
+            $before = $before{$id};
+            if ($y > $startprint2)
+            {
+                print OUTPUT5 "BEFORE\n";
+            }
+        }
+        else
+        {
+            $before = "";
+        }
+        if (exists($before_back{$id}))
+        {
+            $before_back = $before_back{$id};
+            if ($y > $startprint2)
+            {
+                print OUTPUT5 "BEFORE_BACK\n";
+            }
+        }
+        else
+        {
+            $before_back = "";
+        }
+        if (exists($regex_back{$id}))
+        {
+            $use_regex_back = $regex_back{$id};
+        }
+        elsif ($use_regex_back ne "yes2")
+        {
+            $use_regex_back = "";
+        }
+        # "Last chance" mode: flagged for this seed, or the read is still shorter than
+        # the insert size (unless $hp_seed_assemble_last_chance is "yes").
+        if (exists($last_chance{$id}) || (length($read) < $insert_size && $hp_seed_assemble_last_chance ne "yes"))
+        {
+            $last_chance = "yes";
+            if ($y > $startprint2)
+            {
+                print OUTPUT5 "LAST_CHANCE\n";
+            }
+        }
+        else
+        {
+            $last_chance = 
""; + } + if (exists($last_chance_back{$id}) || (($position_back < 225 || $position_back < $insert_size - $read_length + 300) && $noback ne "stop" && $noforward eq "stop" && $hp_seed_assemble_last_chance ne "yes")) + { + $last_chance_back = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "LAST_CHANCE_BACK\n"; + } + } + else + { + $last_chance_back = ""; + } + if (exists($indel_split_skip{$id})) + { + $indel_split_skip = $indel_split_skip{$id}; + if ($y > $startprint2) + { + print OUTPUT5 "INDEL_SPLIT_SKIP\n"; + } + } + else + { + $indel_split_skip = ""; + } + if (exists($indel_split_skip_back{$id})) + { + $indel_split_skip_back = $indel_split_skip_back{$id}; + if ($y > $startprint2) + { + print OUTPUT5 "INDEL_SPLIT_SKIP_BACK\n"; + } + } + else + { + $indel_split_skip_back = ""; + } + if (exists($SNP_active{$id})) + { + $SNP_active = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "SNP_ACTIVE\n"; + } + } + else + { + $SNP_active = ""; + } + if (exists($SNP_active_back{$id})) + { + $SNP_active_back = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "SNP_ACTIVE_BACK\n"; + } + } + else + { + $SNP_active_back = ""; + } + if (exists($nosecond{$id})) + { + $nosecond = "yes"; + } + else + { + $nosecond = ""; + } + if (exists($before_shorter_skip{$id})) + { + $before_shorter_skip = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "BEFORE SHORTER\n"; + } + } + else + { + $before_shorter_skip = ""; + } + if (exists($before_shorter_skip_back{$id})) + { + $before_shorter_skip_back = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "BEFORE SHORTER BACK\n"; + } + } + else + { + $before_shorter_skip_back = ""; + } + if (exists($jump_rep{$id})) + { + $jump_rep = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "JUMP REP\n"; + } + } + else + { + $jump_rep = ""; + } + if (exists($jump_rep_because_stuck{$id})) + { + $jump_rep_because_stuck = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "JUMP REP BECAUSE STUCK\n"; + } + } + else + { + $jump_rep_because_stuck 
= ""; + } + if (exists($jump_rep_back{$id})) + { + $jump_rep_back = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "JUMP REP BACK\n"; + } + } + else + { + $jump_rep_back = ""; + } + if (exists($reference_next_seed{$id})) + { + $reference_next_seed = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "REF NEXT SEED\n"; + } + delete $reference_next_seed{$id}; + } + else + { + $reference_next_seed = ""; + } + + if (exists($no_next_seed{$id})) + { + $no_next_seed = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "NO NEXT SEED\n"; + } + delete $no_next_seed{$id}; + } + else + { + $no_next_seed = ""; + } + + if ($last_150 ne "" && length($read) > $last_150+130) + { + $noforward{$id} = "stop"; + $noforward = "stop"; + $no_next_seed = "yes"; + } + if ($first_150 ne "" && length($read) > $first_150+200) + { + $first_150 = ""; + } + + if (exists($yuyu_option{$id.'A'})) + { + $yuyu_option_A = "A"; + } + if (exists($yuyu_option{$id.'C'})) + { + $yuyu_option_C = "C"; + } + if (exists($yuyu_option{$id.'G'})) + { + $yuyu_option_G = "G"; + } + if (exists($yuyu_option{$id.'T'})) + { + $yuyu_option_T = "T"; + } + + if (exists($yuyu_option_back{$id.'A'})) + { + $yuyu_option_A_back = "A"; + } + if (exists($yuyu_option_back{$id.'C'})) + { + $yuyu_option_C_back = "C"; + } + if (exists($yuyu_option_back{$id.'G'})) + { + $yuyu_option_G_back = "G"; + } + if (exists($yuyu_option_back{$id.'T'})) + { + $yuyu_option_T_back = "T"; + } + + if ($noback ne "stop") + { + my $start_repetitive = substr $read, $overlap, $insert_size+100; + my $start_repetitiveb = substr $read, 200, 5000; + $start_repetitive =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $start_repetitiveb =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $repetitive_test = substr $read_short_start2, 0, 15; + my $repetitive_testb = substr $read_short_start2, 0, 200; + my $SNR_skip = "yes"; + my $u = '0'; + while ($SNR_skip eq "yes") + { + $repetitive_test = substr $read_short_start2, $u, 15; + my $SNR_checkA = $repetitive_test =~ tr/A/A/; + my 
$SNR_checkC = $repetitive_test =~ tr/C/C/; + my $SNR_checkT = $repetitive_test =~ tr/T/T/; + my $SNR_checkG = $repetitive_test =~ tr/G/G/; + my $SNR_checkdot = $repetitive_test =~ tr/\./\./; + if ($SNR_checkA+$SNR_checkdot > 12 || $SNR_checkC+$SNR_checkdot > 12 || $SNR_checkG+$SNR_checkdot > 12 || $SNR_checkT+$SNR_checkdot > 12) + { + $SNR_skip = "yes"; + $u += 10; + } + else + { + $SNR_skip = ""; + } + } + $repetitive_test =~ tr/\*//d; + $repetitive_testb =~ tr/\*//d; + $start_repetitiveb =~ tr/\*//d; + my $check_repetitive = $start_repetitive =~ s/$repetitive_test/$repetitive_test/g; + my $check_repetitiveb = $start_repetitiveb =~ s/$repetitive_testb/$repetitive_testb/g; + + if (exists ($rep_return_back{$id})) + { + $repetitive_detect_back = "yes2"; + if ($rep_return_back{$id} < $position_back-(2*$read_length)) + { + delete $rep_return_back{$id}; + } + if ($y > $startprint2) + { + print OUTPUT5 "DETECT_REPETITIVE_BACK_RETURN\n"; + print OUTPUT5 $start_repetitive." START_READ\n"; + } + } + if ($check_repetitive > 1 || $check_repetitiveb > 0) + { + if (exists ($rep_return_back{$id})) + { + } + elsif (length($read) > $insert_size+200) + { + substr $read, 0, ($read_length+50), ""; + $seed{$id} = $read; + $seeds_check{$id} = undef; + $position_back{$id} = $position_back-($read_length+50); + $rep_return_back{$id} = $position_back; + if ($y > $startprint2) + { + print OUTPUT5 "RETURN_REPETITIVE_BACK\n"; + } + goto SEED; + } + $repetitive_detect_back = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "DETECT_REPETITIVE_back\n"; + print OUTPUT5 $start_repetitive." 
START_READ\n"; + } + + my $start_repetitive1 = substr $read, $insert_size+50, 300; + my $check_repetitive1= $start_repetitive1 =~ s/$repetitive_test/$repetitive_test/g; + + if ($check_repetitive1 > 1 || $check_repetitiveb > 1) + { + $repetitive_detect_back2 = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "DETECT_REPETITIVE_back2\n"; + } + my $repetitive_test_stop = substr $read_short_start2, 0, 30; + my $start_repetitive_stop = substr $read, 0, $insert_size+550; + my $start_repetitive_stopb = substr $read, 200, 8000; + my $check_repetitive_stop = $start_repetitive_stop =~ s/$repetitive_test_stop/$repetitive_test_stop/g; + my $check_repetitive_stopb = $start_repetitive_stopb =~ s/$repetitive_testb/$repetitive_testb/g; + if ($check_repetitive_stop > 3 || $check_repetitive_stopb > 2) + { + $noback = "stop"; + $noback{$id} = "stop"; + $read = substr $read, $read_length; + if ($y > $startprint2) + { + print OUTPUT5 "STUCK_IN_REP_BACK\n"; + } + } + } + } + } + if ($noforward ne "stop") + { + my $end_repetitive = substr $read, -$insert_size-100,-$overlap; + my $end_repetitiveb = substr $read, -5000,-200; + $end_repetitive =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $end_repetitiveb =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $repetitive_test2 = substr $read_short_end2, -15; + my $repetitive_test2b = substr $read_short_end2, -200; + my $SNR_skip = "yes"; + my $u = '15'; + while ($SNR_skip eq "yes") + { + $repetitive_test2 = substr $read_short_end2, -$u, 15; + my $SNR_checkA = $repetitive_test2 =~ tr/A/A/; + my $SNR_checkC = $repetitive_test2 =~ tr/C/C/; + my $SNR_checkT = $repetitive_test2 =~ tr/T/T/; + my $SNR_checkG = $repetitive_test2 =~ tr/G/G/; + my $SNR_checkdot = $repetitive_test2 =~ tr/\./\./; + if ($SNR_checkA+$SNR_checkdot > 12 || $SNR_checkC+$SNR_checkdot > 12 || $SNR_checkG+$SNR_checkdot > 12 || $SNR_checkT+$SNR_checkdot > 12) + { + $SNR_skip = "yes"; + $u += 10; + } + else + { + $SNR_skip = ""; + } + } + $repetitive_test2 =~ tr/\*//d; + $repetitive_test2b =~ tr/\*//d; 
+ $end_repetitiveb =~ tr/\*//d; + + my $check_repetitive2 = $end_repetitive =~ s/$repetitive_test2/$repetitive_test2/g; + my $check_repetitive2b = $end_repetitiveb =~ s/$repetitive_test2b/$repetitive_test2b/g; + + if (exists ($rep_return{$id})) + { + $repetitive_detect = "yes2"; + if ($y > $startprint2) + { + print OUTPUT5 "DETECT_REPETITIVE_RETURN\n"; + print OUTPUT5 $end_repetitive." END_READ\n"; + } + if ($rep_return{$id} < $position-(2*$read_length)) + { + delete $rep_return{$id}; + } + } + if ($check_repetitive2 > 1 || $check_repetitive2b > 0) + { + if (exists ($rep_return{$id})) + { + } + elsif (length($read) > $insert_size+200) + { + substr $read, -($read_length+50), ($read_length+50), ""; + $seed{$id} = $read; + $seeds_check{$id} = undef; + $position{$id} = $position-($read_length+50); + $rep_return{$id} = $position; + if ($y > $startprint2) + { + print OUTPUT5 "RETURN_REPETITIVE\n"; + } + goto SEED; + } + $repetitive_detect = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "DETECT_REPETITIVE\n"; + print OUTPUT5 $end_repetitive." 
END_READ\n"; + } + + my $end_repetitive1 = substr $read, -$insert_size-350, 300; + my $check_repetitive21 = $end_repetitive1 =~ s/$repetitive_test2/$repetitive_test2/g; + + if ($check_repetitive21 > 1 || $check_repetitive2b > 1) + { + $repetitive_detect2 = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "DETECT_REPETITIVE2\n"; + } + my $repetitive_test_stop = substr $read_short_end2, -30; + my $end_repetitive_stopb = substr $read, -8000,-200; + my $check_repetitive_stopb = $end_repetitive_stopb =~ s/$repetitive_test2b/$repetitive_test2b/g; + if ($check_repetitive_stopb > 2 && ($jump_rep_because_stuck ne "yes" || $count_stuck_in_rep > 9)) + { + $jump_rep_because_stuck = "yes"; + $jump_rep_because_stuck{$id} = "yes"; + } + elsif ($check_repetitive_stopb > 2) + { + $noforward = "stop"; + $noforward{$id} = "stop"; + $no_next_seed = "yes"; + $no_next_seed{$id} = "yes"; + $read = substr $read, 0, -$read_length; + if ($y > $startprint2) + { + print OUTPUT5 "STUCK_IN_REP\n"; + } + } + } + } + } + if ($y > $startprint2 && $benchmark_time eq "yes") + { + $time_before_merge = time; + if ($time_before_merge - $time_start_seed > 1) + { + print OUTPUT5 $time_before_merge - $time_start_seed." TIME0\n"; + } + } +MERGE: + my $merge_extra = '0'; + if ($y > $startprint2) + { + if (exists($old_id{$id})) + { + print OUTPUT5 $old_id{$id}." 
OLD_ID\n"; + } + if (exists($old_rep{$id})) + { + print OUTPUT5 "OLD_REP\n"; + } + } + if (exists $old_id{$id} && exists $old_rep{$id}) + { + if ($position_back > $read_length && $position_back < ($insert_size*3) && $repetitive_detect_back eq "" && $noback ne "stop") + { + my $read_oldie = $seed_old{$old_id{$id}}; + my $read_newest = $read; + + my $start_seq = substr $read, 0, $insert_size*3; + my $start_seq1 = substr $read, 0, 39; + my $start_seq2 = substr $read, 30, 39; + my $end_seq = substr $read_oldie, -$insert_size*3; + $merge_read_length = length ($read); + $start_seq =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $start_seq1 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $start_seq2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $end_seq =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + if ($end_seq =~ m/.*.$start_seq1(.*)$/) + { + my $r = length($1); + my $read_temp = $read; + my $read1 = substr $read_oldie, 0, -39-$r; + + $read = $read1.$read_temp; + $merge_read = "yes"; + } + elsif ($end_seq =~ m/.*.$start_seq2(.*)$/) + { + my $r = length($1); + my $read_temp = $read; + my $read1 = substr $read_oldie, 0, -39-$r; + my $read2 = substr $read_temp, 30; + + $read = $read1.$read2; + $merge_read = "yes"; + } + if ($merge_read eq "yes") + { + $seed{$id} = $read; + $seeds_check{$id} = undef; + delete $tree{$old_id{$id}}; + + foreach my $tree_tmp (keys %tree) + { + my $old = $old_id{$id}; + my $tree2 = $tree{$tree_tmp}; + my $tree3 = $tree{$tree_tmp}; + + if ($old_id{$id} =~ m/.*_(\d+)$/) + { + $old = $1; + } + my $id_tmp = $id; + if ($id =~ m/.*_(\d+)$/) + { + $id_tmp = $1; + } + my @ids_split = split /\*/, $tree2; + foreach my $id_split (@ids_split) + { + if ($id_split =~ m/^$old(REP)*$/) + { + if ($tree2 =~ m/^(.*\*)*$old(REP)*(\*.*)*$/) + { + if (defined($1)) + { + $tree3 = $1.$id_tmp; + } + else + { + $tree3 = $id_tmp; + } + if (defined($2)) + { + $tree3 = $tree3."REP"; + } + if (defined($3)) + { + $tree3 = $tree3.$3; + } + } + } + } + delete $tree{$tree_tmp}; + $tree{$tree_tmp} = $tree3; + foreach my 
$contigs_end (keys %contigs_end)
+                {
+                    # Re-point any contig end that referenced the old contig to the merged one.
+                    if ($contigs_end{$contigs_end} eq $old)
+                    {
+                        delete $contigs_end{$contigs_end};
+                        $contigs_end{$contigs_end} = $id_tmp;
+                    }
+                }
+            }
+            # Clear the bookkeeping of the merged-in contig. %old_rep_old is addressed
+            # through $old_id{$id}, so it has to be removed while that entry still
+            # exists; deleting $old_id{$id} first would turn the key into undef and
+            # leave the stale %old_rep_old entry behind.
+            delete $old_rep_old{$old_id{$id}};
+            delete $old_id{$id};
+            delete $old_rep{$id};
+            if ($y > $startprint2)
+            {
+                print OUTPUT5 "Merged both contigs rep!\n";
+                print OUTPUT5 ">".$read_newest."\n";
+            }
+            $noback = "stop";
+            $noback{$id} = "stop";
+            $merge_extra = $position;
+        }
+    }
+    elsif ((($noback eq "stop" || $position_back >= ($insert_size*3)) && $position > $insert_size+200) || $merge_now ne "")
+    {
+        # No overlap merge happened: record the link in %tree with a "REP" suffix,
+        # stop backward extension and store the old seed as a contig of its own.
+        my $id_tmp = $id;
+        if ($id =~ m/.*_(\d+)$/)
+        {
+            $id_tmp = $1;
+        }
+        $tree{$old_id{$id}} = $id_tmp."REP";
+        $noback{$id} = "stop";
+        $noback = "stop";
+        $merge_extra = $position;
+        print OUTPUT5 ">Remove rep merge!\n";
+
+        if ($contig_num eq '1')
+        {
+            $contigs{$contig_num."+".$old_id{$id}} = $seed_old{$old_id{$id}};
+            my $start_point = '500';
+            my $check_repetitive = '3';
+
+            # Slide the start point in steps of 20 until the 15-mer at that point
+            # occurs at most twice in the surrounding 340-base window.
+            while ($check_repetitive > 2)
+            {
+                my $repetitive = substr $seed_old{$old_id{$id}}, $start_point, 15;
+                # NOTE(review): tr with an empty replacement list and no /d only
+                # counts characters; $repetitive is not modified by this statement.
+                $repetitive =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;
+                my $read_short_area = substr $seed_old{$old_id{$id}}, $start_point -170, 340;
+                $check_repetitive = $read_short_area =~ s/$repetitive/$repetitive/g;
+                if ($check_repetitive > 2)
+                {
+                    $start_point += 20;
+                    if ($y > $startprint2)
+                    {
+                        print OUTPUT5 "DETECT_REPETITIVE_IN_START_SEQUENCE\n";
+                    }
+                }
+            }
+            # Take $overlap bases as the first-contig start marker, moving on in
+            # steps of 10 while the marker still contains ambiguity codes or dots.
+            $first_contig_start = substr $seed_old{$old_id{$id}}, $start_point, $overlap;
+            my $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;
+            while ($check_start > 0)
+            {
+                $start_point += 10;
+                $first_contig_start = substr $seed_old{$old_id{$id}}, $start_point, $overlap;
+                $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;
+            }
+            if ($y > $startprint2)
+            {
+                print OUTPUT5 $first_contig_start." CONTIG_START2\n";
+            }
+            # Discard every pending seed whose first 1000 bases contain the marker.
+            foreach my $seedie (keys %seed)
+            {
+                my $seedie_part = substr $seed{$seedie}, 0, 1000;
+                my $check_seedie = $seedie_part =~ s/$first_contig_start/$first_contig_start/;
+                if ($check_seedie > 0)
+                {
+                    delete $seed{$seedie};
+                }
+            }
+        }
+        else
+        {
+            $contigs{$contig_num."+".$old_id{$id}} = $seed_old{$old_id{$id}};
+        }
+        $contig_num++;
+        # Same ordering requirement as above: remove the %old_rep_old entry before
+        # the $old_id{$id} value it is keyed on disappears.
+        delete $old_rep_old{$old_id{$id}};
+        delete $old_id{$id};
+        delete $old_rep{$id};
+        if ($noback eq "stop" && $noforward eq "stop" && $merge_now eq "yes2")
+        {
+            $contigs{$contig_num."+".$id} = $read;
+            $contig_num++;
+        }
+    }
+}
+else
+{
+    # Non-repeat case: try to join the old seed and the current read through a
+    # 39-base anchor taken from either sequence.
+    if (exists $old_id{$id} && $position_back > 25 && $position_back < ($insert_size*3) && $noback ne "stop")
+    {
+        my $read_oldie = $seed_old{$old_id{$id}};
+        my $read_newest = $read;
+
+        my $start_seq = substr $read, 0, $insert_size+200;
+        my $start_seq1 = substr $read, 0, 39;
+        my $start_seq2 = substr $read, 30, 39;
+        my $end_seq = substr $read_oldie, -$insert_size-200;
+        my $end_seq1 = substr $read_oldie, -39, 39;
+        my $end_seq2 = substr $read_oldie, -72, 39;
+        $merge_read_length = length ($read);
+        # Ambiguity codes become '.', which acts as a wildcard once the strings
+        # are interpolated into the patterns below.
+        $start_seq =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+        $start_seq1 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+        $start_seq2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+        $end_seq =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+        $end_seq1 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+        $end_seq2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+
+
+        if ($end_seq =~ m/.*.$start_seq1(.*)$/)
+        {
+            my $r = length($1);
+            my $read_temp = $read;
+            my $read1 = substr $read_oldie, 0, -39-$r;
+
+            $read = $read1.$read_temp;
+            $merge_read = "yes";
+        }
+        elsif ($end_seq =~ m/.*.$start_seq2(.*)$/)
+        {
+            my $r = length($1);
+            my $read_temp = $read;
+            my $read1 = substr $read_oldie, 0, -39-$r;
+            my $read2 = substr $read_temp, 30;
+
+            $read = $read1.$read2;
+            $merge_read = "yes";
+        }
+        elsif ($start_seq =~ m/(.*.)$end_seq1.*$/)
+        {
+            my $r = length($1);
+            my $read_temp = $read;
+            my $read1 = substr $read_oldie, 0, -39-$r;
+
+            $read = $read1.$read_temp;
+            $merge_read = 
"yes"; + } + elsif ($start_seq =~ m/(.*.)$end_seq2.*$/) + { + my $r = length($1); + my $read_temp = $read; + my $read1 = substr $read_oldie, 0, -39-$r; + my $read2 = substr $read_temp, 30; + + $read = $read1.$read2; + $merge_read = "yes"; + } + + if ($merge_read eq "yes") + { + $seed{$id} = $read; + $seeds_check{$id} = undef; + delete $tree{$old_id{$id}}; + foreach my $tree_tmp (keys %tree) + { + my $old = $old_id{$id}; + my $tree2 = $tree{$tree_tmp}; + my $tree3 = $tree{$tree_tmp}; + if ($old_id{$id} =~ m/.*_(\d+)$/) + { + $old = $1; + } + my $id_tmp = $id; + if ($id =~ m/.*_(\d+)$/) + { + $id_tmp = $1; + } + my @ids_split = split /\*/, $tree2; + foreach my $id_split (@ids_split) + { + if ($id_split =~ m/^$old(REP)*$/) + { + if ($tree2 =~ m/^(.*\*)*$old(REP)*(\*.*)*$/) + { + if (defined($1)) + { + $tree3 = $1.$id_tmp; + } + else + { + $tree3 = $id_tmp; + } + if (defined($2)) + { + $tree3 = $tree3."REP"; + } + if (defined($3)) + { + $tree3 = $tree3.$3; + } + } + } + } + delete $tree{$tree_tmp}; + $tree{$tree_tmp} = $tree3; + foreach my $contigs_end (keys %contigs_end) + { + if ($contigs_end{$contigs_end} eq $old) + { + delete $contigs_end{$contigs_end}; + $contigs_end{$contigs_end} = $id_tmp; + } + } + } + delete $old_id{$id}; + if ($y > $startprint2) + { + print OUTPUT5 "Merged both contigs!\n"; + print OUTPUT5 ">".$read_newest."\n"; + } + $noback = "stop"; + $noback{$id} = "stop"; + $merge_extra = $position; + } + } + if (((exists $old_id{$id} && ($noback eq "stop" || $position_back >= ($insert_size*3)) && $position > 1000) || $merge_now ne "") && $merge_read eq "") + { + $merge_read_length = length($read); + $merge_read = "yes"; + $read = $seed_old{$old_id{$id}} ."LLLLLLLLLLLLLLL".$read; + $seed{$id} = $read; + $seeds_check{$id} = undef; + $hasL = "yes"; + foreach my $tree_tmp (keys %tree) + { + print OUTPUT5 $tree_tmp." TREE_TMP\n"; + print OUTPUT5 $tree{$tree_tmp}." 
TREE_TMP2\n"; + my $old = $old_id{$id}; + my $tree2 = $tree{$tree_tmp}; + my $tree3 = $tree{$tree_tmp}; + if ($old_id{$id} =~ m/.*_(\d+)$/) + { + $old = $1; + } + my $id_tmp = $id; + if ($id =~ m/.*_(\d+)$/) + { + $id_tmp = $1; + } + my @ids_split = split /\*/, $tree2; + foreach my $id_split (@ids_split) + { + print OUTPUT5 $id_split." ID_SPLIT\n"; + if ($id_split =~ m/^.*$old(REP)*$/) + { + print OUTPUT5 $tree2." ID_SPLIT2\n"; + if ($tree2 =~ m/^(.*\*)*$old(REP)*(\*.*)*$/) + { + if (defined($1)) + { + $tree3 = $1.$id_tmp; + } + else + { + $tree3 = $id_tmp; + } + if (defined($2)) + { + $tree3 = $tree3."REP"; + } + if (defined($3)) + { + $tree3 = $tree3.$3; + } + print OUTPUT5 $tree3." TREE3\n"; + } + } + } + delete $tree{$tree_tmp}; + $tree{$tree_tmp} = $tree3; + foreach my $contigs_end (keys %contigs_end) + { + if ($contigs_end{$contigs_end} eq $old) + { + delete $contigs_end{$contigs_end}; + $contigs_end{$contigs_end} = $id_tmp; + } + } + } + + if ($contig_num eq '1') + { + my $start_point = '500'; + my $check_repetitive = '3'; + + while ($check_repetitive > 2) + { + my $repetitive = substr $read, $start_point, 15; + $repetitive =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//; + my $read_short_area = substr $read, $start_point -170, 340; + $check_repetitive = $read_short_area =~ s/$repetitive/$repetitive/g; + if ($check_repetitive > 2) + { + $start_point += 20; + if ($y > $startprint2) + { + print OUTPUT5 "DETECT_REPETITIVE_IN_START_SEQUENCE\n"; + } + } + } + $first_contig_start = substr $read, $start_point, $overlap; + my $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//; + while ($check_start > 0) + { + $start_point += 10; + $first_contig_start = substr $read, $start_point, $overlap; + $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//; + } + if ($y > $startprint2) + { + print OUTPUT5 $first_contig_start." 
CONTIG_START2\n"; + } + foreach my $seedie (keys %seed) + { + my $seedie_part = substr $seed{$seedie}, 0, 1000; + my $check_seedie = $seedie_part =~ s/$first_contig_start/$first_contig_start/; + if ($check_seedie > 0) + { + delete $seed{$seedie}; + } + } + } + + delete $old_id{$id}; + $noback{$id} = "stop"; + $noback = "stop"; + $merge_extra = $position; + if ($y > $startprint2) + { + print OUTPUT5 ">Merged contigs with LLLLLLLLLLL!\n"; + } + $contig_gap_min{$id."_".$contig_count} = ($contig_gap_min{$id."_".$contig_count}-$position_back); + $contig_gap_max{$id."_".$contig_count} = ($contig_gap_max{$id."_".$contig_count}-$position_back); + if ($noback eq "stop" && $noforward eq "stop" && $merge_now eq "yes2") + { + $contigs{$contig_num."+".$id} = $read; + $contig_num++; + } + } + } + if ($merge_now ne "") + { + if ($noforward eq "stop") + { + print OUTPUT5 ">MERGE_NOW ".$id."\n"; + print OUTPUT5 $read."\n"; + delete $seed{$id}; + if (!keys %seed) + { + $circle = "contigs"; + goto FINISH; + } + else + { + goto ITERATION; + } + } + else + { + print OUTPUT5 ">\n"; + $noback = "stop"; + $noback{$id} = "stop"; + $seed_split{$id} = undef; + $best_extension = ""; + goto FINISH; + } + } + + + if (exists $old_id{$id}) + { + + } + elsif (keys %contigs && $first_contig_start ne "") + { + my %read_short_end2; + if ($merge_extra ne "") + { + my $long_end = substr $read, -250-$merge_extra; + %read_short_end2 = build_partial3b $long_end; + } + else + { + %read_short_end2 = build_partial3b $read_short_end2; + } + foreach my $read_short_end2_tmp (keys %read_short_end2) + { + my $check_start0 = '0'; + if ($merge_extra ne "") + { + $check_start0 = $read_short_end2_tmp =~ s/$first_contig_start/$first_contig_start/; + } + + my $check_start1 = $read_short_end2_tmp =~ s/$first_contig_start/$first_contig_start/; + my $check_start2 = $read_short_end2_tmp =~ s/$first_contig_start_reverse/$first_contig_start_reverse/; + if ($check_start1 > 0 || $check_start0 > 0) + { + $tree{$id} = "END"; + } 
+        if ($check_start2 > 0)
+        {
+            $tree{$id} = "END_REVERSE";
+        }
+        # The contig ran into the start marker of the first contig (forward or
+        # reverse): stop forward extension, drop the seed and pick the next one.
+        if ($check_start1 > 0 || $check_start2 > 0 || $check_start0 > 0)
+        {
+            $noforward{$id} = "stop";
+            $noforward = "stop";
+            if ($y > $startprint2)
+            {
+                if ($check_start1 > 0 || $check_start0 > 0)
+                {
+                    print OUTPUT5 "\nSTOP_CONTIG, encouter start sequence\n\n";
+                }
+                if ($check_start2 > 0)
+                {
+                    print OUTPUT5 "\nSTOP_CONTIG, encouter start sequence reverse\n\n";
+                }
+                print OUTPUT5 ">".$id."\n";
+                print OUTPUT5 $read."\n";
+            }
+
+            delete $seed{$id};
+            # Only a forward hit stores the read as a finished contig.
+            if ($check_start1 > 0 || $check_start0 > 0)
+            {
+                $contigs{$contig_num."+".$id} = $read;
+                $contig_num++;
+            }
+            goto SEED;
+        }
+    }
+}
+
+REPEAT:
+$read_new = $read;
+$read_new1 = $read;
+
+# Debug banner: report which regex mode is active for each direction.
+if ($y > $startprint2)
+{
+    if ($use_regex eq "yes")
+    {
+        print OUTPUT5 "USE_REGEX\n";
+    }
+    if ($use_regex eq "yes2")
+    {
+        print OUTPUT5 "USE_REGEX2\n";
+    }
+}
+if ($y > $startprint2)
+{
+    if ($use_regex_back eq "yes2")
+    {
+        print OUTPUT5 "USE_REGEX_BACK2\n";
+    }
+    if ($use_regex_back eq "yes")
+    {
+        # Previously printed the "…BACK2" label here as well, which made the
+        # "yes" and "yes2" states indistinguishable in the log.
+        print OUTPUT5 "USE_REGEX_BACK\n";
+    }
+}
+
+
+# Circularisation test: once the read exceeds the lower genome-size bound, look
+# for 42-base anchors near one end inside the 200 bases of the other end.
+if (length($read) > $genome_range_low && $hp_seed_assemble eq "")
+{
+    my $start_seq = substr $read_new, 0, 200;
+    my $start_seq1 = substr $read_new, 30, 42;
+    my $start_seq2 = substr $read_new, 60, 42;
+    my $start_seq3 = substr $read_new, 90, 42;
+    my $end_seq = substr $read_new, -200;
+    my $end_seq1 = substr $read_new, -72, 42;
+    my $end_seq2 = substr $read_new, -102, 42;
+    my $end_seq3 = substr $read_new, -132, 42;
+    my $end_seq1_merge = "";
+    my $end_seq2_merge = "";
+    my $end_seq3_merge = "";
+    # Ambiguity codes become '.', i.e. a wildcard when used inside a pattern.
+    $start_seq =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+    $start_seq1 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+    $start_seq2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+    $start_seq3 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+    $end_seq =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+    $end_seq1 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+    $end_seq2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+    $end_seq3 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+
+    if ($merge_read eq "yes")
+    {
+        $end_seq1_merge = 
substr $read_new, -$merge_read_length+20, 42; + $end_seq2_merge = substr $read_new, -$merge_read_length+100, 42; + $end_seq3_merge = substr $read_new, -$merge_read_length+170, 42; + $end_seq1_merge =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $end_seq2_merge =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $end_seq3_merge =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + } + + if ($end_seq =~ m/.*.$start_seq1(.*)$/) + { + my $r = length($1); + my $read_new_temp = $read_new; + if ($r eq '0') + { + $read_new = substr $read_new_temp, 30+42; + } + else + { + $read_new = substr $read_new_temp, 30+42, -$r; + } + $read = $read_new; + + $circle = "yes"; + $noback = "stop"; + } + elsif ($end_seq =~ m/.*.$start_seq2(.*)$/) + { + my $r = length($1); + my $read_new_temp = $read_new; + if ($r eq '0') + { + $read_new = substr $read_new_temp, 60+42; + } + else + { + $read_new = substr $read_new_temp, 60+42, -$r; + } + $read = $read_new; + + $circle = "yes"; + $noback = "stop"; + } + elsif ($end_seq =~ m/.*.$start_seq3(.*)$/) + { + my $r = length($1); + my $read_new_temp = $read_new; + if ($r eq '0') + { + $read_new = substr $read_new_temp, 90+42; + } + else + { + $read_new = substr $read_new_temp, 90+42, -$r; + } + $read = $read_new; + + $circle = "yes"; + $noback = "stop"; + } + elsif ($start_seq =~ m/(.*.)$end_seq1.*$/) + { + my $r = length($1); + my $read_new_temp = $read_new; + $read_new = substr $read_new_temp, $r, -(42+30); + $read = $read_new; + + $circle = "yes"; + $noback = "stop"; + } + elsif ($start_seq =~ m/(.*.)$end_seq2.*$/) + { + my $r = length($1); + my $read_new_temp = $read_new; + $read_new = substr $read_new_temp, $r, -(42+60); + $read = $read_new; + + $circle = "yes"; + $noback = "stop"; + } + elsif ($start_seq =~ m/(.*.)$end_seq3.*$/) + { + my $r = length($1); + my $read_new_temp = $read_new; + $read_new = substr $read_new_temp, $r, -(42+90); + $read = $read_new; + + $circle = "yes"; + $noback = "stop"; + } + if ($merge_read eq "yes" && $circle ne "yes") + { + if ($start_seq =~ 
m/(.*.)$end_seq1_merge.*$/) + { + my $r = length($1); + my $read_new_temp = $read_new; + $read_new = substr $read_new_temp, $merge_read_length-20, -$r; + $read = $read_new; + + $circle = "yes"; + $noback = "stop"; + } + elsif ($start_seq =~ m/(.*.)$end_seq2_merge.*$/) + { + my $r = length($1); + my $read_new_temp = $read_new; + $read_new = substr $read_new_temp, $merge_read_length-100, -$r; + $read = $read_new; + + $circle = "yes"; + $noback = "stop"; + } + elsif ($start_seq =~ m/(.*.)$end_seq3_merge.*$/) + { + my $r = length($1); + my $read_new_temp = $read_new; + $read_new = substr $read_new_temp, $merge_read_length-170, -$r; + $read = $read_new; + + $circle = "yes"; + $noback = "stop"; + } + } + } + if (keys %contigs && $hp_seed_assemble eq "") + { + my $total_length = length($read); + foreach my $contig_tmp (keys %contigs) + { + $total_length = $total_length + length($contigs{$contig_tmp}); + } + if ($total_length > $genome_range_high + $genome_range_high) + { + $circle = "contigs"; + $noback = "stop"; + $noforward = "stop"; + $contigs{$contig_num."+".$id} = $read; + $contig_num++; + delete $seed{$id}; + if ($y > $startprint2) + { + print OUTPUT5 "TOTAL_LENGTH_ABOVE_UP_LIMIT\n"; + } + goto FINISH2; + } + } + if (length($read) > $genome_range_high && $hp_seed_assemble eq "") + { + $circle = "no"; + $noforward = "stop"; + $noback = "stop"; + + $contigs{$contig_num."+".$id} = $read; + $contig_num++; + delete $seed{$id}; + if ($y > $startprint2) + { + print OUTPUT5 "READ_ABOVE_UP_LIMIT\n"; + } + goto FINISH2; + } + if ($y > $startprint2 && $benchmark_time eq "yes") + { + $time_before_scan = time; + if ($time_before_scan - $time_start_seed > 1) + { + print OUTPUT5 $time_before_scan - $time_start_seed." 
TIME0b\n"; + } + } + chomp $read; + my $overlap_tmp_forward = $overlap; + my $overlap_tmp_back = $overlap; + + my $read_end_AT = substr $read_short_end2, -$read_length+$right-3; + my $A_rich_test = $read_end_AT =~ tr/A\.//; + my $T_rich_test = $read_end_AT =~ tr/T\.//; + my $G_rich_test = $read_end_AT =~ tr/G\.//; + my $C_rich_test = $read_end_AT =~ tr/C\.//; + my $AT_rich_test = $read_end_AT =~ s/AT//g; + if ($A_rich_test > $read_length-$right || $T_rich_test > $read_length-$right || $G_rich_test > $read_length-$right || $C_rich_test > $read_length-$right || $AT_rich_test > ($read_length-$right-4)/2) + { + $AT_rich = "yes"; + goto FINISH; + } +AT: my $read_end_AT2 = substr $read_short_end2, -$overlap_tmp_forward-3, $overlap+3; + my $A_rich_test2 = $read_end_AT2 =~ tr/A\.//; + my $T_rich_test2 = $read_end_AT2 =~ tr/T\.//; + my $G_rich_test2 = $read_end_AT2 =~ tr/G\.//; + my $C_rich_test2 = $read_end_AT2 =~ tr/C\.//; + my $AT_rich_test2 = $read_end_AT2 =~ s/AT//g; + + if (($A_rich_test2 > $overlap-5 || $T_rich_test2 > $overlap-5 || $G_rich_test2 > $overlap-5 || $C_rich_test2 > $overlap-5 || $AT_rich_test2 > ($overlap-8)/2) && $overlap_tmp_forward < $read_length-$right-$left) + { + $AT_rich_before = "yes"; + $overlap_tmp_forward += 5; + goto AT; + } + if ($noback eq "") + { + my $read_start_AT = substr $read_short_start2, 0, $read_length-$right+5; + my $A_rich_test3 = $read_start_AT =~ tr/A/A/; + my $T_rich_test3 = $read_start_AT =~ tr/T/T/; + my $G_rich_test3 = $read_start_AT =~ tr/G/G/; + my $C_rich_test3 = $read_start_AT =~ tr/C/C/; + my $AT_rich_test3 = $read_start_AT =~ s/AT/AT/g; + my $dot_rich_test3 = $read_start_AT =~ tr/\./\./; + if ($A_rich_test3+$dot_rich_test3 > $read_length-$right || $T_rich_test3+$dot_rich_test3 > $read_length-$right || $G_rich_test3+$dot_rich_test3 > $read_length-$right || $C_rich_test3+$dot_rich_test3 > $read_length-$right || $AT_rich_test3+$dot_rich_test3 > ($read_length-$right-4)/2) + { + $noback = "stop"; + $noback{$id} = "stop"; + 
goto MERGE;
+    }
+    # Low-complexity check on the window used for backward extension: count each
+    # base, 'AT' dinucleotides and '.' wildcards; while one of them dominates and
+    # there is still room in the read, widen the back overlap by 5 and re-test.
+AT_BACK: my $read_start_AT2 = substr $read_short_start2, $overlap_tmp_back-$overlap, $overlap+3;
+    my $A_rich_test32 = $read_start_AT2 =~ tr/A/A/;
+    my $T_rich_test32 = $read_start_AT2 =~ tr/T/T/;
+    my $G_rich_test32 = $read_start_AT2 =~ tr/G/G/;
+    my $C_rich_test32 = $read_start_AT2 =~ tr/C/C/;
+    my $AT_rich_test32 = $read_start_AT2 =~ s/AT/AT/g;
+    my $dot_rich_test32 = $read_start_AT2 =~ tr/\./\./;
+    if (($A_rich_test32+$dot_rich_test32 > $overlap-5 || $T_rich_test32+$dot_rich_test32 > $overlap-5 || $G_rich_test32+$dot_rich_test32 > $overlap-5 || $C_rich_test32+$dot_rich_test32 > $overlap-5 || $AT_rich_test32+$dot_rich_test32 > ($overlap-8)/2) && $overlap_tmp_back < $read_length-$right-$left)
+    {
+        $AT_rich_before_back = "yes";
+        $overlap_tmp_back += 5;
+        # This log line used to follow the goto and could never run; emit it
+        # before jumping, under the same verbosity guard as the other log output.
+        if ($y > $startprint2)
+        {
+            print OUTPUT5 $AT_rich_before_back." AT_RICH_BACK\n";
+        }
+        goto AT_BACK;
+    }
+}
+if ($position > $insert_size - $read_length + 300 || $noback eq "")
+{
+    # $s / $e are the offsets of the current forward / backward window; they
+    # start at the extra overlap accumulated by the low-complexity checks.
+    my $s = $overlap_tmp_forward - $overlap;
+    my $e = $overlap_tmp_back - $overlap;
+
+    while ($s < $read_length-($overlap+$left+1-$containX_short_end2-$containX_short_end2) && $e < $read_length-($overlap+$left+1-$containX_short_start2-$containX_short_start2))
+    {
+        my $read_end_d = substr $read_short_end2, -($s+$overlap), $overlap;
+        my $read_start_t = substr $read_short_start2, $e, $overlap;
+
+        if ($containX_short_end2 > 0)
+        {
+            # Grow the forward window by two characters per '*' it contains and
+            # repeat until growing no longer pulls additional '*' into the window.
+            my $star = $read_end_d =~ tr/\*/\*/;
+
+            $read_end_d = substr $read_short_end2, -($s+$overlap+($star*2)), $overlap+($star*2);
+            my $star2 = $read_end_d =~ tr/\*/\*/;
+            while ($star2 > $star)
+            {
+                $read_end_d = substr $read_short_end2, -($s+$overlap+($star*2)+(($star2-$star)*2)), $overlap+($star*2)+(($star2-$star)*2);
+                $star = $star2;
+                $star2 = $read_end_d =~ tr/\*/\*/;
+            }
+        }
+        if ($containX_short_start2 > 0)
+        {
+            # Same growth rule for the backward window.
+            my $star = $read_start_t =~ tr/\*/\*/;
+
+            $read_start_t = substr $read_short_start2, $e, $overlap+($star*2);
+            my $star2 = $read_start_t =~ tr/\*/\*/;
+            while ($star2 > $star)
+            {
+                $read_start_t = 
substr $read_short_start2, $e, $overlap+($star*2)+(($star2-$star)*2); + $star = $star2; + $star2 = $read_start_t =~ tr/\*/\*/; + } + } + if ($s eq ($overlap_tmp_forward - $overlap)) + { + $read_end = substr $read_short_end2, -$overlap; + $read_start = substr $read_short_start2, 0, $overlap; + + if ($containX_short_end2 > 0) + { + my $star = $read_end =~ tr/\*/\*/; + + $read_end = substr $read_short_end2, -($overlap+($star*2)), $overlap+($star*2); + my $star2 = $read_end =~ tr/\*/\*/; + while ($star2 > $star) + { + $read_end = substr $read_short_end2, -($overlap+($star*2)+(($star2-$star)*2)), $overlap+($star*2)+(($star2-$star)*2); + $star = $star2; + $star2 = $read_end =~ tr/\*/\*/; + } + } + if ($containX_short_start2 > 0) + { + my $star = $read_start =~ tr/\*/\*/; + + $read_start = substr $read_short_start2, 0, $overlap+($star*2); + my $star2 = $read_start =~ tr/\*/\*/; + while ($star2 > $star) + { + $read_start = substr $read_short_start2, 0, $overlap+($star*2)+(($star2-$star)*2); + $star = $star2; + $star2 = $read_start =~ tr/\*/\*/; + } + } + + my $X = ""; + my $Xb = ""; + $read_end =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $read_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + my $read_end_c = $read_end; + $read_end_c =~ tr/ATCG/TAGC/; + $read_end_b = reverse($read_end_c); + + my $read_start_c = $read_start; + $read_start_c =~ tr/ATCG/TAGC/; + $read_start_b = reverse($read_start_c); + + if ($containX_short_end2 > 0) + { + $X = $read_end =~ tr/\*/\*/; + if ($X > 0) + { + %read_end = build_partial3c $read_end, ""; + %read_end_b = build_partial3c ($read_end_b, "reverse"); + } + else + { + $read_end{$read_end} = undef; + $read_end_b{$read_end_b} = undef; + } + } + else + { + $read_end{$read_end} = undef; + $read_end_b{$read_end_b} = undef; + } + if ($containX_short_start2 > 0) + { + $Xb = $read_start =~ tr/\*/\*/; + if ($Xb > 0) + { + %read_start = build_partial3c $read_start, "back"; + %read_start_b = build_partial3c ($read_start_b, "reverse_back"); + } + else + { + 
$read_start{$read_start} = undef; + $read_start_b{$read_start_b} = undef; + } + } + else + { + $read_start{$read_start} = undef; + $read_start_b{$read_start_b} = undef; + } + if ($y > $startprint2) + { + print OUTPUT5 $read_end." READ_END\n"; + if ($X > 0) + { + foreach (keys %read_end) + { + print OUTPUT5 $_."\n"; + } + } + print OUTPUT5 $read_start." READ_START\n"; + if ($Xb > 0) + { + foreach (keys %read_start) + { + print OUTPUT5 $_."\n"; + } + } + } + } + if ($noforward ne "stop") + { + my $read_end_c = $read_end_d; + + $read_end_c =~ tr/ATCG/TAGC/; + my $read_end_e = reverse($read_end_c); + + + my %read_end_e; + undef %read_end_e; + + if ($contain_dot_short_end2 > 0 || $containX_short_end2 > 0) + { + my $dot = $read_end_e =~ tr/\./\./; + my $X = $read_end_e =~ tr/\*/\*/; + if ($X > 0 || $dot > 0) + { + %read_end_e = build_partial3c ($read_end_e, "reverse"); + } + else + { + $read_end_e{$read_end_e} = undef; + } + } + else + { + $read_end_e{$read_end_e} = undef; + } + if ($use_regex ne "" || $last_chance eq "yes") + { + my %read_end_e = build_partial3b ($read_end_e, "reverse"); + my $X_test = $read_end_e =~ tr/\./\./; + my %list; + undef %list; + if ($X_test < 2 && $use_regex ne "yes2" && $hp_seed_assemble eq "") + { + %list = build_partial2b %read_end_e; + %read_end_e = %list; + } + + foreach my $list (keys %read_end_e) + { + if (exists($hash2c{$list})) + { + my $search = $hash2c{$list}; + + $search = substr $search, 1; + my @search = split /,/,$search; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $merged_match1{$search} = $found; + $merged_match_pos{$search} = $s; + if ($hp_seed_assemble ne "") + { + 
foreach my $SNP_tmp (keys %SNPs) + { + if ($position-($s+$overlap) < $SNP_tmp && ($position-$s) >= $SNP_tmp) + { + $accepted_SNPs{$search} = undef; + } + } + } + } + } + } + } + } + else + { + foreach my $read_end_e2 (keys %read_end_e) + { + if (exists($hash2c{$read_end_e2})) + { + my $search = $hash2c{$read_end_e2}; + $search = substr $search, 1; + my @search = split /,/,$search; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $merged_match1{$search} = $found; + $merged_match_pos{$search} = $s; + if ($hp_seed_assemble ne "") + { + foreach my $SNP_tmp (keys %SNPs) + { + if ($position-($s+$overlap) < $SNP_tmp && ($position-$s) >= $SNP_tmp) + { + $accepted_SNPs{$search} = undef; + } + } + } + } + } + } + } + } + if ($last_chance eq "yes") + { + my %read_end_d; + undef %read_end_d; + + if ($contain_dot_short_end2 > 0 || $containX_short_end2 > 0) + { + my $dot = $read_end_d =~ tr/\./\./; + my $X = $read_end_d =~ tr/\*/\*/; + if ($X > 0 || $dot > 0) + { + %read_end_d = build_partial3b $read_end_d, ""; + } + else + { + $read_end_d{$read_end_d} = undef; + } + } + else + { + $read_end_d{$read_end_d} = undef; + } + + my $X_test = $read_end_d =~ tr/\./\./; + my %list; + undef %list; + if ($X_test < 2 && $use_regex ne "yes2" && $hp_seed_assemble eq "") + { + %list = build_partial2b %read_end_d; + } + else + { + %list = %read_end_d; + } + foreach my $list (keys %list) + { + if (exists($hash2b{$list})) + { + my $search = $hash2b{$list}; + + $search = substr $search, 1; + my @search = split /,/,$search; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if 
(exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $merged_match2{$search} = $found; + $merged_match_pos{$search} = $s; + if ($hp_seed_assemble ne "") + { + foreach my $SNP_tmp (keys %SNPs) + { + if ($position-($s+$overlap) < $SNP_tmp && ($position-$s) >= $SNP_tmp) + { + $accepted_SNPs{$search} = undef; + } + } + } + } + } + } + } + } + } + if ($noback ne "stop") + { + my %read_start_t; + undef %read_start_t; + + if ($contain_dot_short_start2 > 0 || $containX_short_start2 > 0) + { + my $dot = $read_start_t =~ tr/\./\./; + my $X = $read_start_t =~ tr/\*/\*/; + if ($X > 0 || $dot > 0) + { + %read_start_t = build_partial3c ($read_start_t, "back"); + } + else + { + $read_start_t{$read_start_t} = undef; + } + } + else + { + $read_start_t{$read_start_t} = undef; + } + if ($last_chance_back eq "yes" || $use_regex_back ne "") + { + my %read_start_t = build_partial3b ($read_start_t, "back"); + my $X_test = $read_start_t =~ tr/\./\./; + my %list; + undef %list; + if ($X_test < 2 && $use_regex_back ne "yes2" && $hp_seed_assemble eq "") + { + %list = build_partial2b %read_start_t; + %read_start_t = %list; + } + foreach my $list (keys %read_start_t) + { + if (exists($hash2c{$list})) + { + my $search = $hash2c{$list}; + + $search = substr $search, 1; + my @search = split /,/,$search; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $merged_match_back1{$search} = $found; + $merged_match_back_pos{$search} 
= $e; + + if ($hp_seed_assemble ne "") + { + foreach my $SNP_tmp (keys %SNPs) + { + $SNP_tmp *= -1; + if ($position_back-$e > $SNP_tmp && ($position_back-$e-$overlap) <= $SNP_tmp) + { + $accepted_SNPs_back{$search} = undef; + } + } + } + } + } + } + } + } + else + { + foreach my $read_start_t2 (keys %read_start_t) + { + if (exists($hash2c{$read_start_t2})) + { + my $search = $hash2c{$read_start_t2}; + $search = substr $search, 1; + my @search = split /,/,$search; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $merged_match_back1{$search} = $found; + $merged_match_back_pos{$search} = $e; + + if ($hp_seed_assemble ne "") + { + foreach my $SNP_tmp (keys %SNPs) + { + $SNP_tmp *= -1; + if ($position_back-$e > $SNP_tmp && ($position_back-$e-$overlap) <= $SNP_tmp) + { + $accepted_SNPs_back{$search} = undef; + } + } + } + } + } + } + } + } + if ($last_chance_back eq "yes") + { + my $read_start_c = $read_start_t; + $read_start_c =~ tr/ATCG/TAGC/; + my $read_start_e = reverse($read_start_c); + + my %read_start_e; + undef %read_start_e; + + if ($contain_dot_short_start2 > 0 || $containX_short_start2 > 0) + { + my $dot = $read_start_e =~ tr/\./\./; + my $X = $read_start_e =~ tr/\*/\*/; + if ($X > 0 || $dot > 0) + { + %read_start_e = build_partial3c ($read_start_e, "reverse_back"); + } + else + { + $read_start_e{$read_start_e} = undef; + } + } + else + { + $read_start_e{$read_start_e} = undef; + } + %read_start_e = build_partial3b ($read_start_e, "reverse_back"); + my $X_test = $read_start_e =~ tr/\./\./; + my %list; + undef %list; + if ($X_test < 2 && $use_regex_back ne "yes2" && $hp_seed_assemble eq "") + { + %list = 
build_partial2b %read_start_e; + %read_start_e = %list; + } + foreach my $list (keys %read_start_e) + { + if (exists($hash2b{$list})) + { + my $search = $hash2b{$list}; + + $search = substr $search, 1; + my @search = split /,/,$search; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $merged_match_back2{$search} = $found; + $merged_match_back_pos{$search} = $e; + + if ($hp_seed_assemble ne "") + { + foreach my $SNP_tmp (keys %SNPs) + { + $SNP_tmp *= -1; + if ($position_back-$e > $SNP_tmp && ($position_back-$e-$overlap) <= $SNP_tmp) + { + $accepted_SNPs_back{$search} = undef; + } + } + } + } + } + } + } + } + } + $s++; + $e++; + } + } + else + { + my $s = $overlap_tmp_forward - $overlap; + while ($s < $read_length-($overlap+$right)) + { + my $read_end_d = substr $read, -($s+$overlap), $overlap; + $read_end_d =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + if ($s eq ($overlap_tmp_forward - $overlap)) + { + $read_end = substr $read, -$overlap, $overlap; + $read_end =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + my $read_end_c = $read_end; + $read_end_c =~ tr/ATCG/TAGC/; + $read_end_b = reverse($read_end_c); + + $read_end{$read_end} = undef; + $read_end_b{$read_end_b} = undef; + if ($y > $startprint2) + { + print OUTPUT5 $read_end." 
READ_END\n"; + foreach (keys %read_end) + { + print OUTPUT5 $_."\n"; + } + } + } + my $read_end_c = $read_end_d; + $read_end_c =~ tr/ATCG/TAGC/; + my $read_end_e = reverse($read_end_c); + + if ($use_regex eq "yes" || $use_regex eq "yes2") + { + my %list = build_partial3b ($read_end_e, "reverse"); + + foreach my $list (keys %list) + { + if (exists($hash2c{$list})) + { + my $search = $hash2c{$list}; + $search = substr $search, 1; + my @search = split /,/,$search; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $merged_match1{$search} = $found; + $merged_match_pos{$search} = $s; + + if ($hp_seed_assemble ne "") + { + foreach my $SNP_tmp (keys %SNPs) + { + if ($position-($s+$overlap) < $SNP_tmp && ($position-$s) >= $SNP_tmp) + { + $accepted_SNPs{$search} = undef; + } + } + } + } + } + } + } + } + else + { + if (exists($hash2c{$read_end_e})) + { + my $search = $hash2c{$read_end_e}; + $search = substr $search, 1; + my @search = split /,/,$search; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $merged_match1{$search} = $found; + $merged_match_pos{$search} = $s; + + if ($hp_seed_assemble ne "") + { + foreach my $SNP_tmp (keys %SNPs) + { + if ($position-($s+$overlap) < $SNP_tmp && ($position-$s) >= $SNP_tmp) + { + $accepted_SNPs{$search} = undef; + } + } + } + } + } + } + 
} + if ($last_chance eq "yes") + { + my %read_end_d; + undef %read_end_d; + + if ($contain_dot_short_end2 > 0 || $containX_short_end2 > 0) + { + my $dot = $read_end_d =~ tr/\./\./; + my $X = $read_end_d =~ tr/\*/\*/; + if ($X > 0 || $dot > 0) + { + %read_end_d = build_partial3b ($read_end_d, ""); + } + else + { + $read_end_d{$read_end_d} = undef; + } + } + else + { + $read_end_d{$read_end_d} = undef; + } + + my $X_test = $read_end_d =~ tr/\./\./; + my %list; + undef %list; + if ($X_test < 2 && $use_regex ne "yes2" && $hp_seed_assemble eq "") + { + %list = build_partial2b %read_end_d; + } + else + { + %list = %read_end_d; + } + foreach my $list (keys %list) + { + if (exists($hash2b{$list})) + { + my $search = $hash2b{$list}; + + $search = substr $search, 1; + my @search = split /,/,$search; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $merged_match2{$search} = $found; + $merged_match_pos{$search} = $s; + + if ($hp_seed_assemble ne "") + { + foreach my $SNP_tmp (keys %SNPs) + { + if ($position-($s+$overlap) < $SNP_tmp && ($position-$s) >= $SNP_tmp) + { + $accepted_SNPs{$search} = undef; + } + } + } + } + } + } + } + } + $s++; + } + } + %merged_match = (%merged_match1, %merged_match2); + %merged_match_back = (%merged_match_back1, %merged_match_back2); + my $mmr = '0'; + my $mmbr = '0'; + foreach (keys %merged_match) + { + $mmr++; + } + foreach (keys %merged_match_back) + { + $mmbr++; + } + if ($y > $startprint2) + { + print OUTPUT5 $mmr." MATCH_ARRAY_READ\n"; + print OUTPUT5 $mmbr." 
MATCH_ARRAY_BACK_READ\n"; + } + foreach my $add_read2 (keys %merged_match) + { + my $add_read = substr $add_read2, 0, -1; + $count_reads_all{$add_read} = undef; + } + foreach my $add_read2 (keys %merged_match_back) + { + my $add_read = substr $add_read2, 0, -1; + $count_reads_all{$add_read} = undef; + } +REGEX: + my $time_for_FOUND = '0'; + my $time_for_NO_MATCH = '0'; + my $time_test = '0'; + + my $X4 = $read_short_end =~ tr/\*//; + + if ($X4 > 0) + { + %read_short_end_tmp = build_partial3c ($read_short_end, ""); + } + else + { + $read_short_end_tmp{$read_short_end} = undef; + } + + my $read_count = '0'; + my $read_ex = '0'; + my $read2_ex = '0'; + + if ($y > $startprint2) + { + if ($use_regex ne "") + { + print OUTPUT5 "USE_REGEX_REVERSE\n"; + } + } + if ($y > $startprint2 && $benchmark_time eq "yes") + { + $time_end_hash_scan = time; + if ($time_end_hash_scan - $time_before_scan > 3) + { + print OUTPUT5 $time_end_hash_scan - $time_before_scan." TIME1\n"; + } + } + if ($noforward eq "stop") + { + goto BACK; + } + my $extra_overlap; + my $read_short_end_tempie ; + my $test_dot; + my $test_star; + my %hash_read_short_end; + my %hash_read_short_end_dot; + if ($last_chance ne "yes") + { + $extra_overlap = sprintf("%.0f", (($read_length-$overlap)/3)); + $read_short_end_tempie = substr $read, -($insert_size*$insert_range)+10, ((($insert_size*$insert_range)-$insert_size)*2)+$read_length-($read_length-$overlap-$extra_overlap-10)+($read_length-$left-$right-$overlap); + $read_short_end_tempie =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + undef %read_short_end_tmp; + $test_dot = $read_short_end_tempie =~ tr/\./\./; + $test_star = $read_short_end_tempie =~ tr/\*/\*/; + + if ($test_dot > 3 || $mmr < 20) + { + %read_short_end_tmp = build_partial3c $read_short_end_tempie; + } + else + { + %read_short_end_tmp = build_partial3b $read_short_end_tempie; + } + if ($mmr > 19) + { + my $ff = '0'; + undef %hash_read_short_end; + undef %hash_read_short_end_dot; + foreach my $read_short_end_tempie 
(keys %read_short_end_tmp) + { + $ff = '0'; + while ($ff < (length($read_short_end_tempie)-($overlap+$extra_overlap)-1)) + { + my $read_short_end_part = substr $read_short_end_tempie, $ff, $overlap+$extra_overlap; + if ($test_dot < 6 && $test_dot > 3) + { + my $test_dot2 = $read_short_end_part =~ tr/\./\./; + if ($test_dot2 > 0) + { + $hash_read_short_end_dot{$read_short_end_part} = $ff; + } + else + { + $hash_read_short_end{$read_short_end_part} = $ff; + } + } + else + { + $hash_read_short_end{$read_short_end_part} = $ff; + } + $ff++; + } + } + } + } + +NO_MATCH: foreach my $ln (keys %merged_match) + { + $match = $merged_match{$ln}; + $id_match = $ln; + chomp $id_match; + chomp $match; + + my $time_no_match = time; + + if ($hp_seed_assemble ne "" && $last_chance eq "yes") + { + my $check = ""; + foreach my $pos_tmp (keys %SNPs) + { + if ($pos_tmp <= $position && $pos_tmp > $position-length($match)) + { + $check = "yes"; + } + } + if ($check ne "yes") + { + next NO_MATCH; + } + } + if ($last_chance eq "yes") + { + my $forward = ""; + if (exists($merged_match1{$ln})) + { + my $match_reverse = reverse($match); + $match_reverse =~ tr/ACTG/TGAC/; + my $test = substr $match_reverse, ($merged_match_pos{$ln}+$right), $overlap; + + my $last_nuc = substr $test, -1, 1; + if ($heteroplasmy ne "" && ($last_nuc eq "1" || $last_nuc eq "2" || $last_nuc eq "3" || $last_nuc eq "4" || $last_nuc eq "N") && $SNR_read eq "") + { + next NO_MATCH; + } + if ($use_quality ne "") + { + $test =~ tr/1234/TGAC/; + my $countN = $match =~ tr/1234//; + if ($countN > length($match)*0.35) + { + next NO_MATCH; + } + } + foreach my $read_end_tmp (keys %read_end) + { + if ($test eq $read_end_tmp) + { + $extension = substr $match_reverse, ($merged_match_pos{$ln}+$right+$overlap); + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto LAST1; + } + } + my $test_N1 = $read_end_b =~ tr/N/\./; + my $test_N2 = $test =~ tr/N/\./; + + if 
($test_N1 > 0 && $test_N2 eq '0' && $test_N1 < $overlap/6) + { + foreach my $read_end_tmp (keys %read_end) + { + if ($test =~ m/$read_end_tmp/) + { + $extension = substr $match_reverse, ($merged_match_pos{$ln}+$right+$overlap); + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto LAST1; + } + } + } + elsif ($test_N2 > 0 && $test_N1 eq '0' && $test_N2 < $overlap/6) + { + foreach my $read_end_tmp (keys %read_end) + { + if ($read_end_tmp =~ m/$test/) + { + $extension = substr $match_reverse, ($merged_match_pos{$ln}+$right+$overlap); + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto LAST1; + } + } + } + foreach my $read_end_tmp (keys %read_end) + { + my @test_tmp = split //, $test; + my @read_end_tmp = split //, $read_end_tmp; + + my $d = '0'; + my $next = ""; + my $N = '0'; + + while ($d < length($read_end_tmp)) + { + if ($test_tmp[$d] eq $read_end_tmp[$d]) + { + } + elsif ($read_end_tmp[$d] eq ".") + { + } + elsif ($test_tmp[$d] eq "." 
&& $N < '5') + { + $N++; + } + elsif ($next eq "" && $hp_seed_assemble eq "" && length($read) > $insert_size+200) + { + $next = "yes"; + } + elsif ($next eq "yes") + { + $next = "yes2"; + } + elsif ($next eq "yes2") + { + $next = "yes3"; + } + else + { + next NO_MATCH; + } + $d++ + } + + $extension = substr $match_reverse, ($merged_match_pos{$ln}+$right+$overlap); + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto LAST1; + } + } + elsif (exists($merged_match2{$ln})) + { + my $test = substr $match, $merged_match_pos{$ln}+$left, $overlap; + + my $last_nuc = substr $match, $merged_match_pos{$ln}+$left+$overlap+1, 1; + if ($heteroplasmy ne "" && ($last_nuc eq "1" || $last_nuc eq "2" || $last_nuc eq "3" || $last_nuc eq "4" || $last_nuc eq "N") && $SNR_read eq "") + { + next NO_MATCH; + } + if ($use_quality ne "") + { + $test =~ tr/1234/ACTG/; + my $countN = $match =~ tr/1234//; + if ($countN > length($match)*0.35) + { + next NO_MATCH; + } + } + foreach my $read_end_tmp (keys %read_end) + { + if ($test eq $read_end_tmp) + { + $extension = substr $match, $merged_match_pos{$ln}+$left+$overlap; + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + $forward = "yes"; + + $read_count++; + goto LAST1; + } + } + my $test_N1 = $read_end =~ tr/N/\./; + my $test_N2 = $test =~ tr/N/\./; + + if ($test_N1 > 0 && $test_N2 eq '0' && $test_N1 < $overlap/6) + { + foreach my $read_end_tmp (keys %read_end) + { + if ($test =~ m/$read_end_tmp/) + { + $extension = substr $match, $merged_match_pos{$ln}+$left+$overlap; + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + $forward = "yes"; + + $read_count++; + goto LAST1; + } + } + } + elsif ($test_N2 > 0 && $test_N1 eq '0' && $test_N2 < $overlap/6) + { + foreach my $read_end_tmp (keys %read_end) + { + if ($read_end_tmp =~ m/$test/) + { + $extension = substr $match, 
$merged_match_pos{$ln}+$left+$overlap; + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + $forward = "yes"; + + $read_count++; + goto LAST1; + } + } + } + foreach my $read_end_tmp (keys %read_end) + { + my @test_tmp = split //, $test; + my @read_end_tmp = split //, $read_end_tmp; + + my $d = '0'; + my $next = ""; + my $N = '0'; + + while ($d < length($read_end_tmp)) + { + if ($test_tmp[$d] eq $read_end_tmp[$d]) + { + } + elsif ($read_end_tmp[$d] eq ".") + { + } + elsif ($test_tmp[$d] eq "." && $N < '5') + { + $N++; + } + elsif ($next eq "" && $hp_seed_assemble eq "" && length($read) > $insert_size+200) + { + $next = "yes"; + } + elsif ($next eq "yes") + { + $next = "yes2"; + } + elsif ($next eq "yes2") + { + $next = "yes3"; + } + elsif ($containX_short_end2 > 0) + { + goto STAR; + } + else + { + next NO_MATCH; + } + $d++ + } + + $extension = substr $match, $merged_match_pos{$ln}+$left+$overlap; + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + $forward = "yes"; + + $read_count++; + goto LAST1; + } + } + +STAR: if (exists($merged_match1{$ln})) + { + if ($containX_short_end2 > 0) + { + foreach my $line (keys %read_end_b) + { + my @read_end_b_sub; + undef @read_end_b_sub; + if ($position > $insert_size+200 && $hp_seed_assemble eq "" && $read_count < 60) + { + @read_end_b_sub = build_partialb $line; + } + else + { + push @read_end_b_sub, $line; + } + + my $found_seq = '0'; + my $match2 = $match; + if ($use_quality ne "") + { + $match2 =~ tr/1234/TGAC/; + } + + foreach my $read_end_b_sub (@read_end_b_sub) + { + $found_seq = $match2 =~ s/(.)$read_end_b_sub/$1+/; + if ($found_seq > 1) + { + my $pos = $merged_match_pos{$ln}; + my $match4b = substr $match, 0, -$pos; + $match4b =~ s/(.+)$read_end_b_sub/$1+/; + my @ext = split /\+/, $match4b; + my $extension5 = $ext[0]; + $extension5 =~ tr/ATCG/TAGC/; + $extension = reverse ($extension5); + $extensions_for_before{$id_match} = 
$extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto LAST1; + } + elsif ($found_seq > 0) + { + my @ext = split /\+/, $match2; + my $extension5 = $ext[0]; + $extension5 =~ tr/ATCG/TAGC/; + $extension = reverse($extension5); + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto LAST1; + } + } + } + } + } + elsif (exists($merged_match2{$ln})) + { + if ($containX_short_end2 > 0) + { + foreach my $line (keys %read_end) + { + my @read_end_sub; + undef @read_end_sub; + if (length($read) > $insert_size+200 && $hp_seed_assemble eq "" && $read_count < 60) + { + @read_end_sub = build_partialb $line; + } + else + { + push @read_end_sub, $line; + } + my $found_seq = '0'; + my $match2 = $match; + if ($use_quality ne "") + { + $match2 =~ tr/1234/ACTG/; + } + + foreach my $read_end_sub (@read_end_sub) + { + $found_seq = $match2 =~ s/.$read_end_sub/+/; + + if ($found_seq > 1) + { + my $pos = $merged_match_pos{$ln}; + my $match4b = substr $match, $pos-2; + $match4b =~ s/.$read_end_sub/+/; + my @ext = split /\+/, $match4b; + $extension = $ext[1]; + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto LAST1; + } + elsif ($found_seq > 0) + { + my @ext = split /\+/, $match2; + $extension = $ext[1]; + + $read_count++; + $forward = "yes"; + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + goto LAST1; + } + } + } + } + } + next NO_MATCH; + +LAST1: my $id_match_end = substr $id_match, -1, 1; + my $id_match_tmp = substr $id_match, 0, -1,; + + if ($hp_seed_assemble ne "") + { + my $check = ""; + foreach my $pos_tmp (keys %SNPs) + { + if ($pos_tmp <= $position && $pos_tmp > ($position-length($match)+length($extension))) + { + if (exists($merged_match1{$ln})) + { + my $match_tmp = reverse($match); + $match_tmp =~ tr/ACTG/TGAC/; + my $hp_SNP_read = substr $match_tmp, 
-(length($extension)+($position-$pos_tmp)+1), 1; + if ($hp_SNP_read eq $SNPs{$pos_tmp}) + { + $check = "yes"; + } + else + { + next NO_MATCH; + } + } + elsif (exists($merged_match2{$ln})) + { + my $hp_SNP_read = substr $match, -(length($extension)+($position-$pos_tmp)+1), 1; + if ($hp_SNP_read eq $SNPs{$pos_tmp}) + { + $check = "yes"; + } + else + { + next NO_MATCH; + } + } + } + } + if (exists($accepted_SNPs{$ln})) + { + $check = "yes"; + } + if ($check ne "yes") + { + next NO_MATCH; + } + } + if (index ($id_match_tmp, $id) eq "-1" && $extension ne "NOOO") + { + my $nuc_exlude = "test"; + if ($yuyu_option_A eq "A" || $yuyu_option_C eq "C" || $yuyu_option_T eq "T" || $yuyu_option_G eq "G") + { + $nuc_exlude = substr $extension, 0, 1; + } + if ($extension ne " " && $extension ne "" && $nuc_exlude ne $yuyu_option_A && $nuc_exlude ne $yuyu_option_C && $nuc_exlude ne $yuyu_option_T && $nuc_exlude ne $yuyu_option_G) + { + if ($use_quality ne "" && $SNR_read2 eq "") + { + $extension =~ tr/1|2|3|4/N/; + } + elsif ($use_quality ne "") + { + $extension =~ tr/1234/ACTG/; + } + push @matches, $id_match.",".$extension.","."".",".$match.",".""; + if ($forward eq "yes") + { + $extensions2b{$id_match} = $extension; + } + else + { + $extensions1b{$id_match} = $extension; + } + $extensions2{$extension} = $id_match; + push @extensions2, $extension; + if ($save_reads ne "") + { + my $add_read = substr $id_match, 0, -1; + if (exists($save_reads{$add_read})) + { + } + else + { + $save_reads{$add_read} = undef; + if ($save_reads eq "2") + { + my $add_read2 = $map_ids{$add_read}; + print OUTPUT10 $add_read2."\/1\n"; + print OUTPUT11 $add_read2."\/2\n"; + } + else + { + print OUTPUT10 ">".$add_read."\/1\n"; + print OUTPUT11 ">".$add_read."\/2\n"; + } + if (exists($hash{$add_read})) + { + my @add_read = split /,/,$hash{$add_read}; + my $forward = $add_read[0]; + my $reverse = $add_read[1]; + if ($use_quality ne "") + { + $forward =~tr/1234/ACTG/; + $reverse =~tr/1234/ACTG/; + } + print 
OUTPUT10 $forward."\n"; + print OUTPUT11 $reverse."\n"; + } + } + } + } + } + next NO_MATCH; + } + else + { + my $match_reverse = reverse($match); + $match_reverse =~ tr/ACTG/TGAC/; + my $test = substr $match_reverse, ($merged_match_pos{$ln}+$right), $overlap; + + my $last_nuc = substr $test, -1, 1; + if ($heteroplasmy ne "" && ($last_nuc eq "1" || $last_nuc eq "2" || $last_nuc eq "3" || $last_nuc eq "4" || $last_nuc eq "N") && $SNR_read eq "") + { + next NO_MATCH; + } + if ($use_quality ne "") + { + $test =~ tr/1234/TGAC/; + my $countN = $match =~ tr/1234//; + if ($countN > length($match)*0.35) + { + next NO_MATCH; + } + } + + foreach my $read_end_tmp (keys %read_end) + { + if ($test eq $read_end_tmp) + { + $extension = substr $match_reverse, ($merged_match_pos{$ln}+$right+$overlap); + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto FOUND; + } + } + + my $test_N1 = $read_end =~ tr/N/\./; + my $test_N2 = $test =~ tr/N/\./; + + if ($test_N1 > 0 && $test_N2 eq '0' && $test_N1 < $overlap/5) + { + foreach my $read_end_tmp (keys %read_end) + { + if ($test =~ m/$read_end_tmp/) + { + $extension = substr $match_reverse, ($merged_match_pos{$ln}+$right+$overlap); + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto FOUND; + } + } + } + elsif ($test_N2 > 0 && $test_N1 eq '0' && $test_N2 < $overlap/5) + { + foreach my $read_end_tmp (keys %read_end) + { + if ($read_end_tmp =~ m/$test/) + { + $extension = substr $match_reverse, ($merged_match_pos{$ln}+$right+$overlap); + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto FOUND; + } + } + } + + foreach my $read_end_tmp (keys %read_end) + { + my @test_tmp = split //, $test; + my @read_end_tmp = split //, $read_end_tmp; + + my $d = '0'; + my $next = ""; + my $N = '0'; + + while ($d < length($read_end_tmp)) + { + if 
($test_tmp[$d] eq $read_end_tmp[$d]) + { + } + elsif ($read_end_tmp[$d] eq ".") + { + } + elsif ($test_tmp[$d] eq "." && $N < '5') + { + $N++; + } + elsif ($next eq "" && $hp_seed_assemble eq "" && $use_regex eq "yes") + { + $next = "yes"; + } + elsif ($next eq "yes") + { + $next = "yes2"; + } + elsif ($next eq "yes2") + { + $next = "yes3"; + } + elsif ($containX_short_end2 > 0) + { + goto STAR2; + } + else + { + next NO_MATCH; + } + $d++ + } + + $extension = substr $match_reverse, ($merged_match_pos{$ln}+$right+$overlap); + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto FOUND; + } + } + +STAR2: if ($containX_short_end2 > 0) + { + foreach my $line (keys %read_end_b) + { + my $found_seq = '0'; + my $match4 = $match; + my @read_end_b_sub; + if ($hp_seed_assemble eq "" && $use_regex eq "yes") + { + @read_end_b_sub = build_partialb $line; + } + else + { + push @read_end_b_sub, $line; + } + + foreach my $read_end_b_subc (@read_end_b_sub) + { + $found_seq = $match4 =~ s/(.)$read_end_b_subc/$1+/g; + if ($found_seq > 1) + { + my $pos = $merged_match_pos{$ln}; + my $match4b = substr $match, 0, -$pos; + $match4b =~ s/(.+)$line/$1+/; + my @ext = split /\+/, $match4b; + my $extension5 = $ext[0]; + $extension5 =~ tr/ATCG/TAGC/; + $extension = reverse ($extension5); + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto FOUND; + } + elsif ($found_seq > 0) + { + my @ext = split /\+/, $match4; + my $extension5 = $ext[0]; + $extension5 =~ tr/ATCG/TAGC/; + $extension = reverse ($extension5); + $extensions_for_before{$id_match} = $extension; + $extensions_for_before2{$id_match} = $match; + + $read_count++; + goto FOUND; + } + } + } + } + next NO_MATCH; + +FOUND: if ($last_chance eq "yes") + { + next NO_MATCH; + } + if ($hp_seed_assemble ne "" && $last_chance ne "yes") + { + my $check = ""; + foreach my $pos_tmp (keys %SNPs) + { + if ($pos_tmp 
<= $position && $pos_tmp > ($position-length($match)+length($extension))) + { + if (exists($merged_match1{$ln})) + { + my $match_tmp = reverse($match); + $match_tmp =~ tr/ACTG/TGAC/; + my $hp_SNP_read = substr $match_tmp, -(length($extension)+($position-$pos_tmp)+1), 1; + if ($hp_SNP_read eq $SNPs{$pos_tmp}) + { + $check = "yes"; + } + else + { + delete $accepted_SNPs{$ln}; + next NO_MATCH; + } + } + } + } + if ($check eq "yes") + { + $accepted_SNPs{$ln} = undef; + } + } + if ($extension ne "NOOO") + { + my $time_FOUND = time; + $time_for_NO_MATCH += ($time_FOUND-$time_no_match); + my $id_match_b = $id_match; + my $id_match_end = substr $id_match_b, -1, 1,"",; + + if (exists($hash{$id_match_b})) + { + my @id_match_b = split /,/, $hash{$id_match_b}; + + if ($id_match_end eq "1") + { + $match_pair = $id_match_b[1]; + } + elsif ($id_match_end eq "2") + { + $match_pair = $id_match_b[0]; + } + else + { + next NO_MATCH; + } + chomp($match_pair); + if ($encrypt eq "yes") + { + $match_pair = decrypt $match_pair; + } + $match_pair =~ tr/1234/ACTG/; + + my $match_pair_middle = substr $match_pair, 10, $overlap+$extra_overlap; + my $countN = $match_pair_middle =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + if ($use_quality ne "") + { + $match_pair_middle =~ tr/1234/ACTG/; + } + my $extra_regex_tmp = ""; + if ($countN > 0 && $countN < length($match_pair_middle)/5) + { + $extra_regex_tmp = "yes"; + } + elsif ($countN >= length($match_pair_middle)/5) + { + next NO_MATCH; + } + if ($test_dot > 5 || $extra_regex_tmp eq "yes") + { + my $time_FOUND_EXTRA_REGEX = time; + if (exists($hash_read_short_end{$match_pair_middle})) + { + $extension_match = ""; + if ($insert_size_correct eq "yes" && length($read) > $insert_size*$insert_range) + { + my $cal = ($insert_size*$insert_range)-10; + if ($cal > length($read)) + { + $cal = length($read); + } + my $insert_size_tmp = $cal-$hash_read_short_end{$match_pair_middle}+10+length($extension); + push @insert_size, $insert_size_tmp; + } + if 
($hp_seed_assemble ne "") + { + check_HP_pos($hash_read_short_end{$match_pair_middle}, $position, $position_back, $match_pair, $ln); + } + goto SKIP3; + } + my @match_pair_middle_sub = split //, $match_pair_middle; + my $gh = '0'; + $extension_match = ""; + foreach my $line (keys %read_short_end_tmp) + { + my @line = split //,$line; + $gh = '0'; + $line =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + +CHECK_PAIR: while ($gh < length($line)-length($match_pair_middle)) + { + my $d = '0'; + my $next = ""; + my $th = '0'; + my $N = '0'; + if ($use_regex ne "yes") + { + $next = "yes3"; + } + + while ($d < length($match_pair_middle)) + { + $th = $d + $gh; + if ($match_pair_middle_sub[$d] eq $line[$th]) + { + } + elsif ($line[$th] eq ".") + { + } + elsif ($match_pair_middle_sub[$d] eq ".") + { + $N++; + } + elsif ($next eq "" && $hp_seed_assemble eq "") + { + $next = "yes"; + } + elsif ($next eq "yes") + { + $next = "yes2"; + } + elsif ($next eq "yes2") + { + $next = "yes3"; + } + else + { + $gh++; + goto CHECK_PAIR; + } + $d++ + } + goto CHECK_PAIR0; + } + } + $extension_match = "NOOO"; + +CHECK_PAIR0: $counttest1++; + if ($insert_size_correct eq "yes" && length($read) > $insert_size*$insert_range) + { + my $cal = ($insert_size*$insert_range)-10; + if ($cal > length($read)) + { + $cal = length($read); + } + my $insert_size_tmp = $cal-$gh+10+length($extension); + push @insert_size, $insert_size_tmp; + } + $time_test += (time-$time_FOUND_EXTRA_REGEX); + if ($hp_seed_assemble ne "") + { + check_HP_pos($gh, $position, $position_back, $match_pair, $ln); + } + goto SKIP3; + } + else + { + my $found_seq = '0'; + my $line_tmpb; + if (exists($hash_read_short_end{$match_pair_middle})) + { + if ($insert_size_correct eq "yes" && length($read) > $insert_size*$insert_range) + { + my $cal = ($insert_size*$insert_range)-10; + if ($cal > length($read)) + { + $cal = length($read); + } + my $insert_size_tmp = $cal-$hash_read_short_end{$match_pair_middle}+10+length($extension); + push @insert_size, 
$insert_size_tmp; + } + $extension_match = ""; + if ($hp_seed_assemble ne "") + { + check_HP_pos($hash_read_short_end{$match_pair_middle}, $position, $position_back, $match_pair, $ln); + } + goto SKIP3; + } + if ($test_dot > 3 && $mmr > 19) + { + my $match_pair_middle_tmp = $match_pair_middle; + CHECK_PAIR2: foreach my $line (keys %hash_read_short_end_dot) + { + my $found_seq = '0'; + my $star_first = substr $line, 0, 1; + if ($star_first eq "*") + { + $line = substr $line, 1; + $match_pair_middle_tmp = substr $match_pair_middle_tmp, 1; + } + $found_seq = $match_pair_middle_tmp =~ s/$line/+/; + $line_tmpb = $read_short_end; + if ($found_seq > 0) + { + last CHECK_PAIR2; + } + } + if ($found_seq > 0) + { + if ($insert_size_correct eq "yes" && length($read) > $insert_size*$insert_range) + { + my $cal = ($insert_size*$insert_range)-10; + if ($cal > length($read)) + { + $cal = length($read); + } + my $insert_size_tmp = $cal-$hash_read_short_end_dot{$match_pair_middle}+10+length($extension); + push @insert_size, $insert_size_tmp; + } + $extension_match = ""; + if ($hp_seed_assemble ne "") + { + check_HP_pos($hash_read_short_end_dot{$match_pair_middle}, $position, $position_back, $match_pair, $ln); + } + goto SKIP3; + } + else + { + $extension_match = "NOOO"; + } + } + elsif ($test_dot > 0 || $test_star > '0' || $mmr < 19) + { + foreach my $line (keys %read_short_end_tmp) + { + $found_seq = '0'; + $found_seq = $line =~ s/$match_pair_middle/+/; + if ($found_seq > 0) + { + $extension_match = ""; + if ($hp_seed_assemble ne "") + { + my @split = split /\+/, $line; + my $th = length($split[0]); + check_HP_pos($th, $position, $position_back, $match_pair, $ln); + } + goto SKIP3; + } + } + } + $extension_match = "NOOO"; + } +SKIP3: + my $nuc_exlude = "test"; + if ($yuyu_option_A eq "A" || $yuyu_option_C eq "C" || $yuyu_option_T eq "T" || $yuyu_option_G eq "G") + { + $nuc_exlude = substr $extension, 0, 1; + } + if ($hp_seed_assemble ne "") + { + my $check = ""; + foreach my 
$pos_tmp (keys %SNPs) + { + if ($pos_tmp <= $position && $pos_tmp > $position-$overlap) + { + $check = "yes"; + } + } + if (exists($accepted_SNPs{$ln})) + { + $check = "yes"; + } + elsif (exists($accepted_SNPs_pair{$ln})) + { + $check = "yes"; + } + if ($check ne "yes") + { + $extension_match = "NOOO"; + } + } + if ($extension_match ne "NOOO" && ($extension ne " " && $extension ne "") && $nuc_exlude ne $yuyu_option_A && $nuc_exlude ne $yuyu_option_C && $nuc_exlude ne $yuyu_option_T && $nuc_exlude ne $yuyu_option_G) + { + $read_ex++; + push @matches, $id_match.",".$extension.","."".",".$match.",".$match_pair; + + if ($extension ne " " && $extension ne "") + { + if ($use_quality ne "" && $SNR_read2 eq "") + { + $extension =~ tr/1|2|3|4/N/; + } + elsif ($use_quality ne "") + { + $extension =~ tr/1234/ACTG/; + } + $extensions1{$extension} = $id_match; + $extensions1b{$id_match} = $extension; + push @extensions1, $extension; + if ($save_reads ne "") + { + my $add_read = substr $id_match, 0, -1; + if (exists($save_reads{$add_read})) + { + } + else + { + $save_reads{$add_read} = undef; + if ($save_reads eq "2") + { + my $add_read2 = $map_ids{$add_read}; + print OUTPUT10 $add_read2."\/1\n"; + print OUTPUT11 $add_read2."\/2\n"; + } + else + { + print OUTPUT10 ">".$add_read."\/1\n"; + print OUTPUT11 ">".$add_read."\/2\n"; + } + if (exists($hash{$add_read})) + { + my @add_read = split /,/,$hash{$add_read}; + my $forward = $add_read[0]; + my $reverse = $add_read[1]; + if ($use_quality ne "") + { + $forward =~tr/1234/ACTG/; + $reverse =~tr/1234/ACTG/; + } + print OUTPUT10 $forward."\n"; + print OUTPUT11 $reverse."\n"; + } + } + } + } + } + } + my $time_after_FOUND = time; + $time_for_FOUND += ($time_after_FOUND-$time_FOUND); + } + } + + %extensions = (%extensions1, %extensions2); + %extensions_original = %extensions; + %extensionsb = (%extensions1b, %extensions2b); + @extensions = (@extensions1, @extensions2); + + my $ext = '0'; + my $ext_total = '0'; + foreach (@extensions) + 
{ + $ext++; + } + $ext_total = $ext; + + if ($y > $startprint2) + { + print OUTPUT5 "\n".$read_count ." READ_COUNT\n"; + print OUTPUT5 $read_ex ." READ_EX\n"; + print OUTPUT5 $ext ." EXTENSIONS\n"; + } + if ($count_coverage < 20) + { + $total_extensions += $ext; + $count_coverage++; + $average_coverage_ext = sprintf("%.0f",$total_extensions/$count_coverage); + print OUTPUT5 $average_coverage_ext ." AVERAGE_COVERAGE\n"; + } + + if ($y > $startprint2 && $benchmark_time eq "yes") + { + $time_collect_ext = time; + + if ($time_collect_ext-$time_end_hash_scan > 2) + { + print OUTPUT5 $time_collect_ext-$time_end_hash_scan." TIME2\n"; + print OUTPUT5 $time_for_NO_MATCH." TIME_FOR_NO_MATCH\n"; + print OUTPUT5 $time_for_FOUND." TIME_FOR_FOUND\n"; + } + print OUTPUT5 $time_test." TIME_TEST\n"; + } + + if (($y > $startprint && $print_log eq '2') || $hp_seed_assemble eq "fsf" || $y eq "vqsvg") + { + foreach my $matches (@matches) + { + my @matchesb; + undef @matchesb; + @matchesb = split /,/, $matches; + my $m_reverse = reverse($matchesb[3]); + $m_reverse =~ tr/ACTG/TGAC/; + my $mp_reverse = reverse($matchesb[4]); + $mp_reverse =~ tr/ACTG/TGAC/; + print OUTPUT5 $matchesb[1]."\n"; + } + } + + my $id_original = $id; + + my @extensions_group1; + my @extensions_group2; + my %extensions_group1; + my %extensions_group2; + my @extensions_group1_old; + my @extensions_group2_old; + my @extensions_group3_old; + my @extensions_group4_old; + undef @extensions_group1_old; + undef @extensions_group2_old; + undef @extensions_group3_old; + undef @extensions_group4_old; + undef @extensions_group1; + undef @extensions_group2; + undef @extensions_group1; + undef @extensions_group2; + my @extensions_group3; + my @extensions_group4; + my %extensions_group3; + my %extensions_group4; + undef @extensions_group3; + undef @extensions_group4; + undef %extensions_group3; + undef %extensions_group4; + + my $no_SNR = ""; + my %extensions_backup = %extensions; + my @extensions_backup = @extensions; + +SPLIT: 
+ if ($split eq "yes") + { + @extensions = @extensions_group2; + %extensions = %extensions_group2; + $split = "yes2"; + $ext = '0'; + foreach (@extensions) + { + $ext++; + } + } + elsif ($split eq "yes2") + { + @extensions = @extensions_group3; + %extensions = %extensions_group3; + $split = "yes3"; + $ext = '0'; + foreach (@extensions) + { + $ext++; + } + } + elsif ($split eq "yes3") + { + @extensions = @extensions_group4; + %extensions = %extensions_group4; + $split = "yes4"; + $ext = '0'; + foreach (@extensions) + { + $ext++; + } + } + elsif ($split eq "yes4") + { + @extensions = @extensions_group1; + %extensions = %extensions_group1; + $split = "yes5"; + $ext = '0'; + foreach (@extensions) + { + $ext++; + } + if ($count_split eq '1') + { + $split = ""; + } + } + $id = $id_original; + $position = $position{$id}; + my $l = '0'; + my $best_extension = ""; + my $SNP = ""; + my $A_SNP = '0'; + my $C_SNP = '0'; + my $T_SNP = '0'; + my $G_SNP = '0'; + my $position_SNP = $position; + my $pos_SNP = '0'; + + my $A_SNP2 = '0'; + my $C_SNP2 = '0'; + my $T_SNP2 = '0'; + my $G_SNP2 = '0'; + my $position_SNP2 = $position; + my $pos_SNP2 = '0'; + + my $A_SNP3 = '0'; + my $C_SNP3 = '0'; + my $T_SNP3 = '0'; + my $G_SNP3 = '0'; + my $position_SNP3 = $position; + my $pos_SNP3 = '0'; + + my $A_SNP4 = '0'; + my $C_SNP4 = '0'; + my $T_SNP4 = '0'; + my $G_SNP4 = '0'; + my $position_SNP4 = $position; + my $pos_SNP4 = '0'; + + my %SNR_count; + my %extensions_new; + my @extensions_new; + undef %SNR_count; + undef %extensions_new; + undef @extensions_new; + my $SNR_test = ""; + my $most_SNR = '0'; + my $most_SNR2 = '0'; + + if ($SNR_read ne "" && $split eq "" && $SNR_read2 ne "" && $use_quality eq "") + { + $SNR_test = "yes2"; + if ($SNR_read eq "yes") + { + $SNR_test = "yes2"; + my $G = '0'; + my $G2 = '0'; + my $no_SNR1 = ""; + my $second_round; +SNR1: foreach my $extensions (@extensions) + { + my @chars = split//, $extensions; + my $e = '0'; + my $check_only_SNR = $extensions =~ 
s/$SNR_nucleo/$SNR_nucleo/g; + if (length($extensions) eq $check_only_SNR) + { + $G2++; + next SNR1; + } + $G++; + if ($second_round eq "") + { + $no_SNR1 = "yes"; + } + while ($SNR_nucleo eq $chars[$e] || $no_SNR1 eq "") + { + if ($SNR_nucleo ne $chars[$e]) + { + $no_SNR1 = "yes"; + } + $e++; + } + if ($e < length($extensions)) + { + $SNR_count{$extensions} = $e; + $SNR_length{$e} .= exists $SNR_length{$e} ? ",$extensions" : $extensions; + } + $no_SNR1 = ""; + } + my $SNR_length_count2 = '0'; + my $SNR_length_reads = ""; + my $first = ""; + foreach my $SNR_length (keys %SNR_length) + { + my $SNR_length_count = $SNR_length{$SNR_length} =~ tr/,/,/; + if ($SNR_length_count > $SNR_length_count2) + { + $SNR_length_count2 = $SNR_length_count; + $SNR_length_reads = $SNR_length{$SNR_length}; + $most_SNR = $SNR_length; + $first = $SNR_length_count; + } + } + my @SNR_length = split /,/, $SNR_length_reads; + my $repetitive_test = substr $read_short_end2, -10, 10; + my $SNR_checkSNR = $repetitive_test =~ s/$SNR_nucleo/$SNR_nucleo/g; + + $SNR_length_count2 = '0'; + foreach my $SNR_length (keys %SNR_length) + { + my $SNR_length_count = $SNR_length{$SNR_length} =~ tr/,/,/; + if ($SNR_length_count > $SNR_length_count2 && $SNR_length_count ne $first) + { + $SNR_length_count2 = $SNR_length_count; + $most_SNR2 = $SNR_length; + } + } + if ($y > $startprint2) + { + print OUTPUT5 $first." 1 ".$most_SNR." 2 ".$G." G\n"; + } + if ($first < 0.8*$G && $second_round eq "" && $first > 0.35*$G && $SNR_length_count2 > 0.35*$G) + { + $no_SNR = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "NO_SNR\n"; + } + goto NUCLEO0; + } + elsif ($first < 0.8*$G && $second_round eq "") + { + if ($y > $startprint2) + { + print OUTPUT5 $most_SNR." 
MOST_SNR SECOND ROUND\n"; + } + $second_round = "yes"; + $no_SNR1 = ""; + $most_SNR = '0'; + $G = '0'; + undef %SNR_count; + undef %SNR_length; + goto SNR1; + } + elsif ($first <= 0.35*$G && $second_round eq "jf") + { + if ($y > $startprint2) + { + print OUTPUT5 "SNR_NEXT_SEED1\n"; + } + $SNR_next_seed = "yes"; + $noforward{$id} = "stop"; + delete $seed{$id_split2}; + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split3}; + $noforward = "stop"; + if ($y > $startprint2) + { + print OUTPUT5 "SNR_NEXT_SEED\n"; + } + goto BACK; + } + else + { + foreach my $SNRie (@SNR_length) + { + if (exists($extensions{$SNRie})) + { + $extensions_new{$SNRie} = $extensions{$SNRie}; + push @extensions_new, $SNRie; + } + } + %extensions = %extensions_new; + @extensions = @extensions_new; + } + } + if ($SNR{$id} eq "yes2_double") + { + $SNR_test = "yes2_double"; + foreach my $extensions (@extensions) + { + my @chars = split//, $extensions; + my $e = '0'; + if ($SNR_nucleo eq $chars[$e].$chars[$e+1] ) + { + while ($SNR_nucleo eq $chars[$e].$chars[$e+1]) + { + my $tempie = reverse $extensions; + chop $tempie; + chop $tempie; + $extensions = reverse $tempie; + $e++; + $e++; + } + } + else + { + while ($SNR_nucleo eq $chars[$e+1].$chars[$e]) + { + my $tempie = reverse $extensions; + chop $tempie; + chop $tempie; + $extensions = reverse $tempie; + $e++; + $e++; + } + } + $extensions_new{$extensions} = $extensions{$extensions}; + push @extensions_new, $extensions; + if ($e < length($extensions)) + { + $SNR_count{$extensions} = $e; + } + } + %extensions = %extensions_new; + @extensions = @extensions_new; + } + delete $SNR{$id}; + } +NUCLEO0: + if ($SNR_read ne "") + { + $ext = '0'; + foreach (@extensions) + { + $ext++; + } + } + my $highest_all_freq = '0'; + my $extra_l = '0'; + my $ll = $read_length - ($overlap+$left-1) + $extra_l; + + +NUCLEO: while ($l < $ll && $l < 149) + { + my $A = '0'; + my $C = '0'; + my $T = '0'; + my $G = '0'; + my $skipped = '0'; + + if 
($SNR_read2 ne "" && $l > 0 && $split eq "")
+        {
+            # Keep only the extensions that agree with the base chosen for the
+            # previous column (or that are too short to have that column).
+            my $best_extension_tmp8 = $best_extension;
+            $best_extension_tmp8 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./;
+            # FIX: this debug line was printed unconditionally on every column;
+            # it is now gated like the other OUTPUT5 diagnostics in this loop.
+            if ($y > $startprint2)
+            {
+                print OUTPUT5 $best_extension_tmp8." TMP\n";
+            }
+            my $last_nuc = substr $best_extension_tmp8, -1;
+            my $arrSize1 = @extensions;
+            if ($last_nuc ne '.' && $arrSize1 > 4)
+            {
+                my @extensions_tmp;
+                undef @extensions_tmp;
+                foreach my $extensions (@extensions)
+                {
+                    my @chars = split//, $extensions;
+                    if ($chars[$l-1] eq $last_nuc || length($extensions) < $l)
+                    {
+                        push @extensions_tmp, $extensions;
+                    }
+                }
+                my $arrSize2 = @extensions_tmp;
+
+                if ($arrSize1 ne $arrSize2)
+                {
+                    undef @extensions;
+                    @extensions = @extensions_tmp;
+
+                    # If the consensus built so far contains an ambiguous
+                    # position, restart the column scan on the reduced set.
+                    my $best_extension_dot = $best_extension_tmp8 =~ tr/\./\./;
+                    if ($best_extension_dot > 0)
+                    {
+                        $l = 0;
+                        $SNP = "";
+                        $best_extension = "";
+                        goto NUCLEO0;
+                    }
+                }
+            }
+        }
+
+        # Tally the bases found at column $l over all extensions; any other
+        # non-empty character is counted in $skipped.
+        foreach my $extensions (@extensions)
+        {
+            my @chars = split//, $extensions;
+
+            if ($chars[$l] eq "A")
+            {
+                $A++;
+            }
+            elsif ($chars[$l] eq "C")
+            {
+                $C++;
+            }
+            elsif ($chars[$l] eq "T")
+            {
+                $T++;
+            }
+            elsif ($chars[$l] eq "G")
+            {
+                $G++;
+            }
+            elsif ($chars[$l] ne "")
+            {
+                $skipped++;
+            }
+        }
+        # $c is the factor by which one base must outnumber the other three
+        # combined, $q the upper bound on $ext divided by the column total
+        # (see the base-call conditions further down). The assignments below
+        # are order-dependent: a later matching rule overrides an earlier one.
+        my $c = '2.8';
+        my $q = '2';
+
+        if ($ext > 22 && $SNR_read eq "")
+        {
+            $c = '3.7';
+        }
+        if ($SNR_read2 ne "" && $l < 5)
+        {
+            $c = '1.4';
+        }
+        if ($ext > 6 && $SNR_read eq "" && ($type ne "chloro" || $extensions_before ne ""))
+        {
+            $c = '5';
+        }
+        if ($ext > 22 && $SNR_read eq "" && $type ne "chloro")
+        {
+            $c = '6.5';
+        }
+        if ($ext > 38 && $SNR_read eq "" && $type ne "chloro")
+        {
+            $c = '8.4';
+        }
+        if ($ext > 100 && $SNR_read eq "" && $type eq "mito_plant")
+        {
+            $c = '13';
+        }
+        if ($ext > $average_coverage_ext*4 && $SNR_read eq "" && $type eq "mito_plant")
+        {
+            $c = '25';
+        }
+        if ($repetitive_detect ne "" && $ext < 23 && $SNR_read eq "")
+        {
+            $c = '7';
+        }
+        if ($repetitive_detect ne "" && $ext >= 23 && $SNR_read eq "")
+        {
+            $c = '9';
+        }
+        if ($repetitive_detect2 eq "yes" || ($repetitive_detect ne ""
&& $ext > 150)) + { + $c = '15'; + } + if ($extensions_before eq "yes" && $c < 6.3) + { + $c = '6.3'; + } + if ($extensions_before eq "yes" && $type eq "mito_plant") + { + $c = '13'; + } + if ($extensions_before eq "yes" && $ext > $average_coverage_ext*0.5 && $type eq "mito_plant") + { + $c = '23'; + } + if ($extensions_before eq "yes" && $ext > $average_coverage_ext && $type eq "mito_plant") + { + $c = '35'; + } + if ($type eq "mito_plant") + { + $c += 2; + } + my $v = '6'; + my $s = '3'; + my $z = '1'; + if ($split ne "") + { + $v = '10'; + $z = '0'; + $s = '2'; + } + my $dup = ""; + my $r = 4; + if ($type eq "mito_plant" && $ext > $average_coverage_ext*3) + { + $dup = "yes"; + if ($extensions_before eq "yes") + { + $r = 9; + } + } + if ($type eq "mito_plant" && $ext > $average_coverage_ext*5) + { + $dup = "yes"; + if ($extensions_before eq "yes") + { + $r = 12; + } + } + my $hp = 10000000000; + if ($heteroplasmy ne "" && $SNR_read eq "" && $repetitive_detect eq "" && $hp_seed_assemble eq "") + { + $hp = ($A + $T + $G + $C)*$heteroplasmy; + $q = 1.5; + } + if ($hp_seed_assemble ne "") + { + $v = '40'; + $q = '1.4'; + if ($c < 5 && $SNR_read eq "") + { + $c = '5'; + } + } + if ($heteroplasmy ne "" && $SNR_read eq "" && $SNP eq "" && $l > 7 && $highest_all_freq > $heteroplasmy/2) + { + chop($best_extension); + last NUCLEO; + } + if ($A > ($C + $T + $G)*$c && (($C <= $hp && $T <= $hp && $G <= $hp) || ($C < 2 && $T < 2 && $G < 2)) && (($A > $s && ($ext)/($A+$T+$G+$C+$skipped) < $q) || ($A > $z && $l < $v && ($C + $T + $G) eq 0)) && ($dup ne "yes" || ($C + $T + $G) < $average_coverage_ext/$r)) + { + $best_extension = $best_extension."A"; + $allele_percentage{$l} = $A."+".$C."+".$T."+".$G; + $allele_total{$l} = $A+$C+$T+$G; + $highest_all_freq = ($C+$T+$G)/($A+$C+$T+$G); + } + elsif ($C > ($A + $T + $G)*$c && (($A <= $hp && $T <= $hp && $G <= $hp) || ($A < 2 && $T < 2 && $G < 2)) && (($C > $s && ($ext)/($A+$T+$G+$C+$skipped) < $q) || ($C > $z && $l < $v && ($A + $T + 
$G) eq 0)) && ($dup ne "yes" || ($A + $T + $G) < $average_coverage_ext/$r)) + { + $best_extension = $best_extension."C"; + $allele_percentage{$l} = $A."+".$C."+".$T."+".$G; + $allele_total{$l} = $A+$C+$T+$G; + $highest_all_freq = ($A+$T+$G)/($A+$C+$T+$G); + } + elsif ($T > ($A + $C + $G)*$c && (($C <= $hp && $A <= $hp && $G <= $hp) || ($A < 2 && $C < 2 && $G < 2)) && (($T > $s && ($ext)/($A+$T+$G+$C+$skipped) < $q) || ($T > $z && $l < $v && ($C + $A + $G) eq 0)) && ($dup ne "yes" || ($C + $A + $G) < $average_coverage_ext/$r)) + { + $best_extension = $best_extension."T"; + $allele_percentage{$l} = $A."+".$C."+".$T."+".$G; + $allele_total{$l} = $A+$C+$T+$G; + $highest_all_freq = ($A+$C+$G)/($A+$C+$T+$G); + } + elsif ($G > ($C + $T + $A)*$c && (($C <= $hp && $T <= $hp && $A <= $hp) || ($C < 2 && $T < 2 && $A < 2)) && (($G > $s && ($ext)/($A+$T+$G+$C+$skipped) < $q) || ($G > $z && $l < $v && ($C + $T + $A) eq 0)) && ($dup ne "yes" || ($C + $T + $A) < $average_coverage_ext/$r)) + { + $best_extension = $best_extension."G"; + $allele_percentage{$l} = $A."+".$C."+".$T."+".$G; + $allele_total{$l} = $A+$C+$T+$G; + $highest_all_freq = ($A+$C+$T)/($A+$C+$T+$G); + } + elsif ($hp_seed_assemble ne "" && $SNR_read ne "" && $l > 5) + { + last NUCLEO; + } + elsif ((($heteroplasmy ne "" && $l eq '0') || $SNP_active eq "yes" || ($SNR_read ne "" && $l > 0) || ($extensions_before eq "yes" && $ext_before ne "yes")) && $SNP eq "" && ($A + $T + $G + $C) > 4 && (($l < 15 && $split eq "") || ($l < 11 && $split ne "")) && ($ext)/($A+$T+$G+$C+$skipped) < $q) + { + delete $SNP_active{$id}; + $SNP = "yes"; + $A_SNP = $A; + $C_SNP = $C; + $T_SNP = $T; + $G_SNP = $G; + $position_SNP += $l; + $pos_SNP = $l; + + my @IUPAC = IUPAC($A,$C,$T,$G); + $allele_percentage{$l} = $A."+".$C."+".$T."+".$G; + $best_extension = $best_extension.$IUPAC[0]; + + $allele_total{$l} = $A+$C+$T+$G; + } + elsif ($hp_seed_assemble ne "" && $PCR_free ne "yes") + { + last NUCLEO; + } + elsif ((($heteroplasmy ne "" && $l < 
$pos_SNP+10) || $heteroplasmy eq "") && $SNP eq "yes" && ($A + $T + $G + $C) > 4 && $l < 15 && ($ext)/($A+$T+$G+$C+$skipped) < $q) + { + if ($heteroplasmy ne "" && $SNR_read2 eq "") + { + my %remove_extension_mismatch_tmp = mismatch (\%extensions, \%remove_extension_mismatch, $best_extension); + %remove_extension_mismatch = (%remove_extension_mismatch, %remove_extension_mismatch_tmp) + } + $SNP = "yes2"; + $A_SNP2 = $A; + $C_SNP2 = $C; + $T_SNP2 = $T; + $G_SNP2 = $G; + $position_SNP2 += $l; + $pos_SNP2 = $l; + + my @IUPAC = IUPAC($A,$C,$T,$G); + $allele_percentage{$l} = $A."+".$C."+".$T."+".$G; + $best_extension = $best_extension.$IUPAC[0]; + + $allele_total{$l} = $A+$C+$T+$G; + } + elsif ((($heteroplasmy ne "" && $l < $pos_SNP2+10) || $heteroplasmy eq "") && $SNP eq "yes2" && ($A + $T + $G + $C) > 4 && $l < 15 && ($ext)/($A+$T+$G+$C+$skipped) < $q) + { + if ($heteroplasmy ne "" && $SNR_read2 eq "") + { + my %remove_extension_mismatch_tmp = mismatch (\%extensions, \%remove_extension_mismatch, $best_extension); + %remove_extension_mismatch = (%remove_extension_mismatch, %remove_extension_mismatch_tmp) + } + $SNP = "yes3"; + $A_SNP3 = $A; + $C_SNP3 = $C; + $T_SNP3 = $T; + $G_SNP3 = $G; + $position_SNP3 += $l; + $pos_SNP3 = $l; + + my @IUPAC = IUPAC($A,$C,$T,$G); + $allele_percentage{$l} = $A."+".$C."+".$T."+".$G; + $best_extension = $best_extension.$IUPAC[0]; + + $allele_total{$l} = $A+$C+$T+$G; + } + elsif ((($heteroplasmy ne "" && $l < $pos_SNP3+10) || $heteroplasmy eq "") && $SNP eq "yes3" && ($A + $T + $G + $C) > 4 && $l < 15 && ($ext)/($A+$T+$G+$C+$skipped) < $q) + { + if ($heteroplasmy ne "" && $SNR_read2 eq "") + { + my %remove_extension_mismatch_tmp = mismatch (\%extensions, \%remove_extension_mismatch, $best_extension); + %remove_extension_mismatch = (%remove_extension_mismatch, %remove_extension_mismatch_tmp) + } + $SNP = "yes4"; + $A_SNP4 = $A; + $C_SNP4 = $C; + $T_SNP4 = $T; + $G_SNP4 = $G; + $position_SNP4 += $l; + $pos_SNP4 = $l; + + my @IUPAC = 
IUPAC($A,$C,$T,$G);
+            $allele_percentage{$l} = $A."+".$C."+".$T."+".$G;
+            $best_extension = $best_extension.$IUPAC[0];
+
+            $allele_total{$l} = $A+$C+$T+$G;
+        }
+        # A further ambiguous column after the fourth recorded one ("yes4"):
+        # mark "yes5", optionally trim the consensus back, and stop extending.
+        # FIX: the column total is now checked for zero before it is used as a
+        # divisor. Unlike the neighbouring branches this one had no
+        # "($A+$T+$G+$C) > 4" test in front of the division, so a column that no
+        # extension reaches ended the run with "Illegal division by zero".
+        # Such a column now falls through to the final else (last NUCLEO).
+        elsif ((($heteroplasmy ne "" && $l < $pos_SNP4+10) || $heteroplasmy eq "") && $SNP eq "yes4" && ($pos_SNP ne 0 || ($pos_SNP4 > $pos_SNP+12) || ($extensions_before eq "yes" && $l > 12)) && $split eq "" && ($A+$T+$G+$C+$skipped) > 0 && ($ext)/($A+$T+$G+$C+$skipped) < $q)
+        {
+            $SNP = "yes5";
+            my $g = $l;
+            my $pos_SNP_tmp = $pos_SNP;
+            if ($pos_SNP4 > $pos_SNP+12)
+            {
+                $pos_SNP_tmp = $pos_SNP4;
+            }
+            if ($extensions_before ne "yes" && $pos_SNP ne 0 && $SNR_read2 eq "")
+            {
+                # Drop everything the consensus gained after column $pos_SNP_tmp.
+                while ($g > $pos_SNP_tmp)
+                {
+                    chop($best_extension);
+                    $g--;
+                }
+            }
+            last NUCLEO;
+        }
+
+        # Ambiguity at the very first column: divide the extensions into groups
+        # by their first base and re-enter at SPLIT to process each group.
+        elsif ((($SNP eq "yes4" && $pos_SNP eq 0 && $l <= 15) || ($indel_split_skip ne "yes" && $l eq 0 && $ext > 4)) && ($A + $T + $G + $C) > 4 && ($ext)/($A+$T+$G+$C+$skipped) < $q)
+        {
+            # FIX: an unconditional copy of this print preceded the gated one.
+            if ($y > $startprint2)
+            {
+                print OUTPUT5 $SNP." SNP\n";
+            }
+            if ($SNR_test ne "" && $no_SNR ne "yes")
+            {
+                # The extensions were SNR-filtered: retry once on the unfiltered
+                # backup before splitting.
+                $l = '0';
+                $best_extension = "";
+                $no_SNR = "yes";
+                %extensions = %extensions_backup;
+                @extensions = @extensions_backup;
+
+                goto SPLIT;
+            }
+            if ($SNP eq "")
+            {
+                $A_SNP = $A;
+                $C_SNP = $C;
+                $T_SNP = $T;
+                $G_SNP = $G;
+                if ($y > $startprint2)
+                {
+                    print OUTPUT5 $best_extension." BEST_EXTENSIONll\n";
+                    print OUTPUT5 $A_SNP." A\n";
+                    print OUTPUT5 $C_SNP." C\n";
+                    print OUTPUT5 $T_SNP." T\n";
+                    print OUTPUT5 $G_SNP." G\n";
+                }
+            }
+            $best_extension = "";
+            $split = "yes";
+            undef @firstSNP_max;
+            # $w: minimum share of the column total a base needs to get its own
+            # group; $u: minimum absolute count it has to exceed.
+            my $w = 0.035;
+            my $u = 0;
+            if ($type eq "mito_plant")
+            {
+                $w = 0.015;
+            }
+            # FIX: this block appeared twice in a row, word for word.
+            if ($type eq "mito_plant" && $ext > $average_coverage_ext*5 && $extensions_before eq "yes")
+            {
+                $w = 0.005;
+            }
+            if ($average_coverage_ext > 15 && $extensions_before ne "yes")
+            {
+                $u = 1;
+            }
+
+            # One pass over A, C, T, G (same order and same output strings as the
+            # four copy-pasted blocks this replaces): a base is kept when it is
+            # frequent enough and has no entry in %yuyu_option for this id.
+            my $SNP_total = $C_SNP+$A_SNP+$T_SNP+$G_SNP;
+            foreach my $candidate (["A", $A_SNP], ["C", $C_SNP], ["T", $T_SNP], ["G", $G_SNP])
+            {
+                my ($nuc, $nuc_count) = @$candidate;
+                if ($nuc_count >= $SNP_total*$w && $nuc_count > $u)
+                {
+                    if (exists($yuyu_option{$id.$nuc}))
+                    {
+                        if ($y > $startprint2)
+                        {
+                            print OUTPUT5 "YUYU_".$nuc."_EXISTS\n";
+                        }
+                    }
+                    else
+                    {
+                        push @firstSNP_max, $nuc;
+                    }
+                }
+            }
+
+            $count_split = @firstSNP_max;
+            my $count_split_tmp = '0';
+
+            # Mark the groups that will not exist for this number of candidates.
+            if ($count_split eq '2')
+            {
+                $delete_third = "yes";
+                $delete_second = "yes";
+            }
+            if ($count_split eq '3')
+            {
+                $delete_third = "yes";
+            }
+            if ($count_split eq '1')
+            {
+                $delete_third = "yes";
+                $delete_first = "yes";
+                $delete_second = "yes";
+                $split = "yes4";
+            }
+
+            # Candidate 0 fills group1, candidate 1 group2, and so on.
+            foreach my $firstSNP_max (@firstSNP_max)
+            {
+                foreach my $extensions_tmp (@extensions)
+                {
+                    my @chars = split//, $extensions_tmp;
+                    if ($chars[0] eq $firstSNP_max && $count_split_tmp eq '0')
+                    {
+                        $extensions_group1{$extensions_tmp} =
$extensions{$extensions_tmp}; + push @extensions_group1, $extensions_tmp; + } + elsif ($chars[0] eq $firstSNP_max && $count_split_tmp eq '1') + { + $extensions_group2{$extensions_tmp} = $extensions{$extensions_tmp}; + push @extensions_group2, $extensions_tmp; + } + elsif ($chars[0] eq $firstSNP_max && $count_split_tmp eq '2') + { + $extensions_group3{$extensions_tmp} = $extensions{$extensions_tmp}; + push @extensions_group3, $extensions_tmp; + } + elsif ($chars[0] eq $firstSNP_max && $count_split_tmp eq '3') + { + $extensions_group4{$extensions_tmp} = $extensions{$extensions_tmp}; + push @extensions_group4, $extensions_tmp; + } + } + $count_split_tmp++; + } + if ($y > $startprint2) + { + print OUTPUT5 $count_split." COUNT_SPLIT\n"; + } + goto SPLIT; + } + else + { + last NUCLEO; + } + $l++; + } + + if($split eq "" && $rep_detect2 eq "yes") + { + + } + if($split eq "" && $SNP ne "" && $extensions_before ne "" && $pos_SNP eq '0') + { + my $ext_tmp = $best_extension; + my $count_N = $ext_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + if ($count_N > length($best_extension)/2) + { + $best_extension = ""; + } + } + if ($SNP eq "yes4" && $split eq "" && length($best_extension) < 15 && $pos_SNP ne '0') + { + $SNP = "yes5"; + my $g = $l; + my $pos_SNP_tmp = $pos_SNP; + + while ($g > $pos_SNP_tmp) + { + chop($best_extension); + $g--; + } + if ($y > $startprint2) + { + print OUTPUT5 $best_extension." 
BEST_EXTENSION_chopped\n"; + } + } + my $last_nucleo = substr $best_extension, -1; + $last_nucleo =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + while ($last_nucleo eq '.') + { + chop($best_extension); + $last_nucleo = substr $best_extension, -1; + $last_nucleo =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + } + if ($split eq "yes2" || ($split eq "yes3" && $count_split > 2) || ($split eq "yes4" && $count_split > 3)) + { + if ($split eq "yes2") + { + $best_extension2 = $best_extension; + } + elsif ($split eq "yes3") + { + $best_extension3 = $best_extension; + } + elsif ($split eq "yes4") + { + $best_extension4 = $best_extension; + } + my $best_extension_tmp = $best_extension; + $best_extension_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + my $contig_id2_prev = $id; + + if ($split eq "yes2") + { + $contig_id2 = "2".length($read); + $contig_id2{$contig_id2_prev} = $contig_id2; + $contig_id2_prev = $contig_id2; + $contig_read2 = substr $read, -$read_length; + $contig_read2 = $contig_read2.$best_extension_tmp; + } + elsif ($split eq "yes3") + { + $contig_id3 = "3".length($read); + $contig_id3{$contig_id2_prev} = $contig_id3; + $contig_id2_prev = $contig_id3; + $contig_read3 = substr $read_short_end2, -$read_length; + $contig_read3 = $contig_read3.$best_extension_tmp; + } + elsif ($split eq "yes4") + { + $contig_id4 = "4".length($read); + $contig_id4{$contig_id2_prev} = $contig_id4; + $contig_id2_prev = $contig_id4; + $contig_read4 = substr $read_short_end2, -$read_length; + $contig_read4 = $contig_read4.$best_extension_tmp; + } + + if ($y > $startprint2) + { + if ($split eq "yes2") + { + print OUTPUT5 "GROUP2\n"; + if ($extensions_before ne "yes") + { + @extensions_group2_old = @extensions_group2; + } + foreach my $extensions_tmp (@extensions_group2) + { + print OUTPUT5 $extensions_tmp."\n"; + } + print OUTPUT5 $best_extension_tmp." 
BEST_EXTENSION2\n\n"; + } + elsif ($split eq "yes3") + { + print OUTPUT5 "GROUP3\n"; + if ($extensions_before ne "yes") + { + @extensions_group3_old = @extensions_group3; + } + foreach my $extensions_tmp (@extensions_group3) + { + print OUTPUT5 $extensions_tmp."\n"; + } + print OUTPUT5 $best_extension_tmp." BEST_EXTENSION3\n\n"; + } + elsif ($split eq "yes4") + { + print OUTPUT5 "GROUP4\n"; + if ($extensions_before ne "yes") + { + @extensions_group4_old = @extensions_group4; + } + foreach my $extensions_tmp (@extensions_group4) + { + print OUTPUT5 $extensions_tmp."\n"; + } + print OUTPUT5 $best_extension_tmp." BEST_EXTENSION4\n\n"; + } + } + if ($before eq "yes") + { + if ((length($best_extension2) < 3 || (length($best_extension2) < 6 && ($ext > 15 || $SNR_read ne ""))) && $repetitive_detect eq "" && $before eq "yesss") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION2/3/4\n\n"; + } + if ($split eq "yes2") + { + $delete_first = "yes"; + if ($count_split eq '2') + { + $split = "yes4"; + } + } + if ($split eq "yes3") + { + $delete_second = "yes"; + if ($count_split eq '3') + { + $split = "yes4"; + } + } + if ($split eq "yes4") + { + $delete_third = "yes"; + } + goto SPLIT; + } + if ($type eq "chloro222" && length($best_extension2) > 20) + { + my $best_extension2_reverse2 = $best_extension2; + $best_extension2_reverse2 =~ tr/ATCG/TAGC/; + my $best_extension2_reverse = reverse($best_extension2_reverse2); + + $best_extension2_reverse =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + my $read_cp = $read; + my $read_end_rev_tmp = reverse($read_end); + $read_end_rev_tmp =~ tr/ATCG/TAGC/; + if (length($read) > $genome_range_low) + { + $read_cp = substr $read, 5000; + $read_cp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + } + + my $found_seq_cp = $read_cp =~ s/$best_extension2_reverse/$best_extension2_reverse/; + my $found_seq_cp3 = $read_cp =~ s/$read_end_rev_tmp/$read_end_rev_tmp/; + + if ($found_seq_cp > 0 && $found_seq_cp3 > 0) + { + if ($y > $startprint2) + { + print 
OUTPUT5 "\nDELETE BEST EXTENSION 2 (CP)\n\n"; + } + if ($split eq "yes2") + { + $delete_first = "yes"; + if ($count_split eq '2') + { + $split = "yes4"; + } + } + if ($split eq "yes3") + { + $delete_second = "yes"; + if ($count_split eq '3') + { + $split = "yes4"; + } + } + if ($split eq "yes4") + { + $delete_third = "yes"; + } + goto SPLIT; + } + } + my $end_SNR = substr $read_end, -4; + my $GGGG = $end_SNR =~ tr/G/G/; + my $TTTT = $end_SNR =~ tr/T/T/; + my $CCCC = $end_SNR =~ tr/C/C/; + my $AAAA = $end_SNR =~ tr/A/A/; + if ($GGGG eq '7' || $TTTT eq '7' || $CCCC eq '7' || $AAAA eq '7') + { + $GGGG = $best_extension2 =~ tr/G/G/; + $TTTT = $best_extension2 =~ tr/T/T/; + $CCCC = $best_extension2 =~ tr/C/C/; + $AAAA = $best_extension2 =~ tr/A/A/; + if ($GGGG eq length($best_extension2) || $TTTT eq length($best_extension2) || $CCCC eq length($best_extension2) || $AAAA eq length($best_extension2)) + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION 2/3/4 (SNR)\n\n"; + } + if ($split eq "yes2") + { + $delete_first = "yes"; + if ($count_split eq '2') + { + $split = "yes4"; + } + } + if ($split eq "yes3") + { + $delete_second = "yes"; + if ($count_split eq '3') + { + $split = "yes4"; + } + } + if ($split eq "yes4") + { + $delete_third = "yes"; + } + goto SPLIT; + } + } + + my $contigs_end2 = substr $best_extension2, 0, 7; + $contigs_end2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $contigs_end0 = substr $read_end, -5; + + if (exists($contigs_end{$contigs_end0.$contigs_end2})) + { + my $id_tmp = $id; + if ($id =~ m/.*_(\d+)$/) + { + $id_tmp = $1; + } + $tree{$id_tmp} = $contigs_end{$contigs_end0.$contigs_end2}; + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION 2 (CONTIG_END)\n\n"; + } + $contig_end_check ="yes"; + if ($split eq "yes2") + { + $delete_first = "yes"; + if ($count_split eq '2') + { + $split = "yes4"; + } + } + if ($split eq "yes3") + { + $delete_second = "yes"; + if ($count_split eq '3') + { + $split = "yes4"; + } + } + if 
($split eq "yes4") + { + $delete_third = "yes"; + } + goto SPLIT; + } + + my $hasdot = $best_extension_tmp =~ tr/\./\./; + if (length($best_extension_tmp) > 9 && $hasdot < 2 && $hasdot eq "yesssssjeij") + { + my $end_tmp = substr $read_end, 10; + if (length($best_extension_tmp) < 15) + { + $end_tmp = substr $read_end, length($best_extension_tmp)-5; + } + $end_tmp = $end_tmp.$best_extension_tmp; + my $s = '0'; + my $foundit = ""; + while ($s < length($end_tmp)-$overlap) + { + my $end_tmp_d = substr $end_tmp, -($s+$overlap), $overlap; + + if ($containX_short_end2 > 0) + { + my $star = $end_tmp_d =~ tr/\*//; + + $end_tmp_d = substr $end_tmp, -($s+$overlap+($star*2)), $overlap+($star*2); + my $star2 = $end_tmp_d =~ tr/\*//; + while ($star2 > $star) + { + $end_tmp_d = substr $end_tmp, -($s+$overlap+($star*2)), $overlap+($star*2); + $star = $star2; + $star2 = $end_tmp_d =~ tr/\*//; + } + } + my %end_tmp_d = build_partial3b $end_tmp_d; + foreach my $end_tmp_d (keys %end_tmp_d) + { + if (exists($hash2b{$end_tmp_d})) + { + $foundit = "yes"; + } + elsif (exists($hash2c{$end_tmp_d})) + { + $foundit = "yes"; + } + } + $s++; + } + if ($foundit ne "yes") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION 2/3/4 (no reverse match)\n\n"; + } + if ($split eq "yes2") + { + $delete_first = "yes"; + if ($count_split eq '2') + { + $split = "yes4"; + } + } + if ($split eq "yes3") + { + $delete_second = "yes"; + if ($count_split eq '3') + { + $split = "yes4"; + } + } + if ($split eq "yes4") + { + $delete_third = "yes"; + } + goto SPLIT; + + } + } + } + if ($split eq "yes2") + { + delete $seed{$id}; + $id = "c".$position."_".$id; + $id_split1 = $id; + } + elsif ($split eq "yes3") + { + $id = "d".$position."_".$id; + $id_split2 = $id; + } + elsif ($split eq "yes4") + { + $id = "e".$position."_".$id; + $id_split3 = $id; + } + $position{$id} = $position; + $position_back{$id} = $position_back; + $contig_count{$id} = $contig_count{$id_original}; + my $count_contig_tmp = 
$contig_count; + while ($count_contig_tmp > 0) + { + $contig_gap_min{$id."_".$count_contig_tmp} = $contig_gap_min{$id_original."_".$count_contig_tmp}; + $contig_gap_max{$id."_".$count_contig_tmp} = $contig_gap_max{$id_original."_".$count_contig_tmp}; + $count_contig_tmp--; + } + if (exists($noback{$id_original})) + { + $noback{$id} = $noback; + } + if (exists($old_id{$id_original})) + { + $old_id{$id} = $old_id{$id_original}; + } + if (exists($old_rep{$id_original})) + { + $old_rep{$id} = $old_rep{$id_original}; + } + if (exists($old_rep_old{$id_original})) + { + $old_rep_old{$id} = $old_rep_old{$id_original}; + } + if (exists($nosecond{$id_original})) + { + $nosecond{$id} = undef; + } + if (exists($seed_split{$id_original})) + { + $seed_split{$id} = undef; + } + if (exists($rep_return{$id_original})) + { + $rep_return{$id} = $rep_return{$id_original}; + } + if (exists($rep_return_back{$id_original})) + { + $rep_return_back{$id} = $rep_return_back{$id_original}; + } + if (exists($last_ref_seq_forward{$id_original})) + { + $last_ref_seq_forward{$id} = $last_ref_seq_forward{$id_original}; + } + if (exists($last_ref_seq_back{$id_original})) + { + $last_ref_seq_back{$id} = $last_ref_seq_back{$id_original}; + } + if (exists($last_pos_seq_forward{$id_original})) + { + $last_pos_seq_forward{$id} = $last_pos_seq_forward{$id_original}; + } + if (exists($last_pos_seq_back{$id_original})) + { + $last_pos_seq_back{$id} = $last_pos_seq_back{$id_original}; + } + if (exists($large_variance_forward{$id_original})) + { + $large_variance_forward{$id} = $large_variance_forward{$id_original}; + } + if (exists($large_variance_back{$id_original})) + { + $large_variance_back{$id} = $large_variance_back{$id_original}; + } + if (exists($large_variance_length_forward{$id_original})) + { + $large_variance_length_forward{$id} = $large_variance_length_forward{$id_original}; + } + if (exists($large_variance_length_back{$id_original})) + { + $large_variance_length_back{$id} = 
$large_variance_length_back{$id_original}; + } + if (exists($last_ref_pos_forward{$id_original})) + { + $last_ref_pos_forward{$id} = $last_ref_pos_forward{$id_original}; + } + if (exists($last_ref_pos_back{$id_original})) + { + $last_ref_pos_back{$id} = $last_ref_pos_back{$id_original}; + } + if (exists($no_next_seed{$id_original})) + { + $no_next_seed{$id} = $no_next_seed{$id_original}; + } + if (exists($jump_rep{$id_original})) + { + $jump_rep{$id} = undef; + } + if (exists($jump_rep_because_stuck{$id_original})) + { + $jump_rep_because_stuck{$id} = undef; + } + } + elsif ($split eq "yes5" || (($variance_detection eq "yes" || $heteroplasmy ne "") && $best_extension ne "" && $repetitive_detect eq "" && $last_150 eq "") || $reference_next_seed eq "yes") + { + if ($split eq "" && $repetitive_detect eq "" && ($variance_detection eq "yes" || $reference_next_seed eq "yes" || $heteroplasmy ne "")) + { + goto REFERENCE; + } + $best_extension1 = $best_extension; + + + my $contig_id1_prev = $id; + my $best_extension1_tmp = $best_extension1; + $best_extension1_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + $contig_id1 = "1".length($read); + $contig_id1{$contig_id1_prev} = $contig_id1; + $contig_id1_prev = $contig_id1; + $contig_read1 = substr $read, -$insert_size; + $contig_read1 = $contig_read1.$best_extension1_tmp; + + $contig_id1 = $contig_id1{$id}; + + if ($y > $startprint2) + { + print OUTPUT5 "GROUP1\n"; + if ($extensions_before ne "yes") + { + @extensions_group1_old = @extensions_group1; + } + foreach my $extensions_tmp (@extensions_group1) + { + print OUTPUT5 $extensions_tmp."\n"; + } + print OUTPUT5 $best_extension1." 
BEST_EXTENSION1\n\n"; + } + my $contigs_end0; + if ($before eq "yes") + { + if ((length($best_extension1) < 3 || (length($best_extension1) < 6 && ($ext > 15 || $SNR_read ne ""))) && $repetitive_detect eq "" && $repetitive_detect eq "yesff") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION1\n\n"; + } + delete $seed{$id}; + if ($delete_first eq "yes" && $delete_second eq "yes" && $delete_third eq "yes" && $last_chance eq "yes" && $noback eq "stop") + { + $delete_first = "yes2"; + $delete_second = "yes2"; + $delete_third = "yes2"; + goto FINISH; + } + elsif ($delete_first eq "yes" && $delete_second eq "yes" && $delete_third eq "yes") + { + $best_extension = ""; + $delete_first = "yes2"; + $delete_second = "yes2"; + $delete_third = "yes2"; + goto AFTER_EXT; + } + elsif (($delete_first eq "yes" && $delete_second eq "yes") || ($delete_first eq "yes" && $delete_third eq "yes") || ($delete_second eq "yes" && $delete_third eq "yes")) + { + goto SEED; + } + else + { + $best_extension = ""; + } + } + + if ($type eq "chloro222" && length($best_extension1) > 20 ) + { + my $read_cp = $read; + my $read_end_rev_tmpb = $read_end.$best_extension1; + my $read_end_rev_tmp = reverse($read_end_rev_tmpb); + $read_end_rev_tmp =~ tr/ATCG/TAGC/; + $read_end_rev_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + if (length($read) > 80000) + { + $read_cp = substr $read, -80000; + } + + my $found_seq_cp3 = $read_cp =~ s/$read_end_rev_tmp/$read_end_rev_tmp/; + + if (($found_seq_cp3 > 0) && $before eq "yes") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION1 (CP)\n\n"; + } + delete $seed{$id}; + if ($delete_first eq "yes" && $delete_second eq "yes" && $delete_third eq "yes" && $last_chance eq "yes" && $noback eq "stop") + { + $delete_first = "yes2"; + $delete_second = "yes2"; + $delete_third = "yes2"; + goto FINISH; + } + elsif ($delete_first eq "yes" && $delete_second eq "yes" && $delete_third eq "yes") + { + $best_extension = ""; + $delete_first = "yes2"; + 
$delete_second = "yes2"; + $delete_third = "yes2"; + goto AFTER_EXT; + } + elsif (($delete_first eq "yes" && $delete_second eq "yes") || ($delete_first eq "yes" && $delete_third eq "yes") || ($delete_second eq "yes" && $delete_third eq "yes")) + { + goto SEED; + } + else + { + $best_extension = ""; + } + } + } + my $end_SNR = substr $read_end, -4; + my $GGGG = $end_SNR =~ tr/G/G/; + my $TTTT = $end_SNR =~ tr/T/T/; + my $CCCC = $end_SNR =~ tr/C/C/; + my $AAAA = $end_SNR =~ tr/A/A/; + if ($GGGG eq '7' || $TTTT eq '7' || $CCCC eq '7' || $AAAA eq '7' && $before eq "yessss") + { + $GGGG = $best_extension1 =~ tr/G/G/; + $TTTT = $best_extension1 =~ tr/T/T/; + $CCCC = $best_extension1 =~ tr/C/C/; + $AAAA = $best_extension1 =~ tr/A/A/; + if ($GGGG eq length($best_extension1) || $TTTT eq length($best_extension1) || $CCCC eq length($best_extension1) || $AAAA eq length($best_extension1)) + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION 1 (SNR)\n\n"; + } + delete $seed{$id}; + if ($delete_first eq "yes" && $delete_second eq "yes" && $delete_third eq "yes" && $last_chance eq "yes" && $noback eq "stop") + { + $delete_first = "yes2"; + $delete_second = "yes2"; + $delete_third = "yes2"; + goto FINISH; + } + elsif ($delete_first eq "yes" && $delete_second eq "yes" && $delete_third eq "yes") + { + $best_extension = ""; + $delete_first = "yes2"; + $delete_second = "yes2"; + $delete_third = "yes2"; + goto AFTER_EXT; + } + elsif (($delete_first eq "yes" && $delete_second eq "yes") || ($delete_first eq "yes" && $delete_third eq "yes") || ($delete_second eq "yes" && $delete_third eq "yes")) + { + goto SEED; + } + else + { + $best_extension = ""; + } + } + } + + my $contigs_end1 = substr $best_extension1, 0, 7; + $contigs_end1 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $contigs_end0 = substr $read_end, -5; + + if (exists($contigs_end{$contigs_end0.$contigs_end1})) + { + $tree{$id} = $contigs_end{$contigs_end0.$contigs_end1}; + if ($y > $startprint2) + { + print OUTPUT5 
"\nDELETE BEST EXTENSION 1 (CONTIG_END)\n\n"; + } + my $contigs_end1 = substr $best_extension1, 0, 10; + $contigs_end1 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $contigs_end0 = substr $read_end, -15; + my $repetitive_test = $contigs_end0.$contigs_end1; + my $end_repetitive = $read_short_end2; + my $check_repetitive = $end_repetitive =~ s/.$repetitive_test/$repetitive_test/g; + if ($check_repetitive > 1) + { + $delete_first = "yes"; + $delete_second = "yes"; + $delete_third = "yes"; + } + if ($delete_first eq "yes" && $delete_second eq "yes" && $delete_third eq "yes") + { + $contig_end = "yes"; + $delete_first = "yes2"; + $delete_second = "yes2"; + $delete_third = "yes2"; + goto INDEL; + } + elsif (($delete_first eq "yes" && $delete_second eq "yes") || ($delete_first eq "yes" && $delete_third eq "yes") || ($delete_second eq "yes" && $delete_third eq "yes")) + { + delete $seed{$id}; + goto SEED; + } + else + { + delete $seed{$id}; + $best_extension = ""; + } + } + + my $hasdot = $best_extension1 =~ tr/\./\./; + if (length($best_extension1) > 9 && $hasdot < 2 && $before eq "yesss") + { + my $end_tmp = substr $read_end, 10; + if (length($best_extension1) < 15) + { + $end_tmp = substr $read_end, length($best_extension1)-5; + } + $end_tmp = $end_tmp.$best_extension1; + my $s = '0'; + my $foundit = ""; + while ($s < length($end_tmp)-$overlap) + { + my $end_tmp_d = substr $end_tmp, -($s+$overlap), $overlap; + + if ($containX_short_end2 > 0) + { + my $star = $end_tmp_d =~ tr/\*//; + + $end_tmp_d = substr $end_tmp, -($s+$overlap+($star*2)), $overlap+($star*2); + my $star2 = $end_tmp_d =~ tr/\*//; + while ($star2 > $star) + { + $end_tmp_d = substr $end_tmp, -($s+$overlap+($star*2)), $overlap+($star*2); + $star = $star2; + $star2 = $end_tmp_d =~ tr/\*//; + } + } + my %end_tmp_d = build_partial3b $end_tmp_d; + foreach my $end_tmp_d (keys %end_tmp_d) + { + if (exists($hash2b{$end_tmp_d})) + { + $foundit = "yes"; + } + elsif (exists($hash2c{$end_tmp_d})) + { + $foundit = "yes"; + } + 
} + $s++; + } + if ($foundit ne "yes") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION 1 (no reverse match)\n\n"; + } + delete $seed{$id}; + if ($delete_first eq "yes" && $delete_second eq "yes" && $delete_third eq "yes" && $last_chance eq "yes" && $noback eq "stop") + { + $delete_first = "yes2"; + $delete_second = "yes2"; + $delete_third = "yes2"; + goto FINISH; + } + elsif ($delete_first eq "yes" && $delete_second eq "yes" && $delete_third eq "yes") + { + $best_extension = ""; + $delete_first = "yes2"; + $delete_second = "yes2"; + $delete_third = "yes2"; + goto AFTER_EXT; + } + elsif (($delete_first eq "yes" && $delete_second eq "yes") || ($delete_first eq "yes" && $delete_third eq "yes") || ($delete_second eq "yes" && $delete_third eq "yes")) + { + goto SEED; + } + else + { + $best_extension = ""; + } + } + } + if ($repetitive_detect ne "" && $contig_end_check eq "yes") + { + my $contigs_end1 = substr $best_extension1, 0, 10; + $contigs_end1 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $contigs_end0 = substr $read_end, -15; + my $repetitive_test = $contigs_end0.$contigs_end1; + my $end_repetitive = $read_short_end2; + my $check_repetitive = $end_repetitive =~ s/.$repetitive_test/$repetitive_test/g; + if ($check_repetitive > 1) + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION 1 (CONTIG_END+REPETITIVE)\n\n"; + } + if ($delete_first eq "yes" && $delete_second eq "yes" && $delete_third eq "yes") + { + $delete_first = "yes2"; + $delete_second = "yes2"; + $delete_third = "yes2"; + goto INDEL; + } + elsif (($delete_first eq "yes" && $delete_second eq "yes") || ($delete_first eq "yes" && $delete_third eq "yes") || ($delete_second eq "yes" && $delete_third eq "yes")) + { + delete $seed{$id}; + goto SEED; + } + else + { + delete $seed{$id}; + $best_extension = ""; + } + } + } + } + + my $read_part_tmp = substr $read_short_end2, -$read_length; + my $star_check = $read_part_tmp =~ tr/\*/\*/; + if ($SNP_active eq "yes" && ($before 
eq "yes" || $extensions_before eq "yes" || $platform eq "ion") && $count_split eq '2' && ($star_check eq 0 || $star_check eq "" || $platform eq "ion") && $ext < $average_coverage_ext*3) + { + my @check_deletion = check_deletion($best_extension1, $best_extension2, $best_extension_old1, $best_extension_old2, "", ""); + $best_extension = $check_deletion[0]; + if ($best_extension ne "") + { + if ($y > $startprint2) + { + print OUTPUT5 $best_extension." BEST_EXTENSION_DEL\n"; + } + $deletion = "yes"; + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + delete $indel_split{$id}; + + if ($heteroplasmy eq "") + { + goto INDEL; + } + else + { + $split = ""; + } + } + } +REFERENCE: + if ($last_150 eq "" && ($reference_next_seed eq "yes" || ((((length($best_extension1) > 4 && length($best_extension2) > 4) || (length($best_extension_old1) > 4 && length($best_extension_old2) > 4)) && $reference ne "" && $SNP_active eq "yes" && $repetitive_detect eq "" && $deletion eq "") || (($variance_detection eq "yes" || $heteroplasmy ne "") && $best_extension ne "" && $repetitive_detect eq "")))) + { + my $p = -30; + my $p_prev = 100; + if ($y > $startprint2) + { + print OUTPUT5 "CHECK_REFERENCE\n\n"; + } + my $ref_part_prev; + my $found_further_back; + my %ref_id3; + my @ref_id3; + my $further = ""; + my $first_last_seq_ref; + my $last_seq_ref; + my $check_back_length = '800'; + +CHECK_REF: while ($p > -$check_back_length && $p > -length($read)) + { + if ($found_further_back ne "yes") + { + undef @ref_id3; + undef %ref_id3; + } + my $read_short_end2_tmp = substr $read, -$check_back_length-200; + if (exists($last_ref_seq_forward{$id}) && $first_last_seq_ref ne "yes") + { + $read_short_end2_tmp = $last_ref_seq_forward{$id}; + if (length($read_short_end2_tmp)+$p < 0) + { + $first_last_seq_ref = "yes"; + $read_short_end2_tmp = $read_short_end2; + } + } + my $ref_part2 = substr $read_short_end2_tmp, $p, 30; + my $star2; + if 
($containX_short_end2 > 0) + { + my $star = $ref_part2 =~ tr/\*/\*/; + + $ref_part2 = substr $read_short_end2_tmp, -(-$p+($star*2)), 30+($star*2); + $star2 = $ref_part2 =~ tr/\*/\*/; + while ($star2 > $star) + { + $ref_part2 = substr $read_short_end2_tmp, -(-$p+($star*2)+(($star2-$star)*2)), 30+($star*2)+(($star2-$star)*2); + $star = $star2; + $star2 = $ref_part2 =~ tr/\*/\*/; + } + } + $ref_part2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my %ref_part = build_partial3b $ref_part2, ""; + if ($found_further_back eq "yes") + { + $p++; + } + my $ref_loc = -$p; + + if ($found_further_back eq "") + { + foreach my $ref_part_tmp (keys %ref_part) + { + if (exists($hashref{$ref_part_tmp})) + { + my $ref_id3 = $hashref{$ref_part_tmp}; + my $ref_id2 = substr $ref_id3, 1; + my @ref_id3_tmp; + my @ref_id3_tmp2; + + if ($found_further_back eq "") + { + @ref_id3_tmp = split /,/, $ref_id2; + } + else + { + $ref_part_tmp = $ref_part_prev; + } + + foreach (@ref_id3_tmp) + { + if (($_ < $last_ref_pos_forward{$id}+5000 && $_ > $last_ref_pos_forward{$id}-5000) || $last_ref_pos_forward{$id} eq "") + { + $ref_id3{$_} = $ref_part_tmp; + push @ref_id3_tmp2, $_; + } + } + push @ref_id3, @ref_id3_tmp2; + } + } + } + if ($y > $startprint2) + { + print OUTPUT5 $ref_loc." LOC ".@ref_id3." LOC_REF\n"; + } +CHECKED_BACK_REF: + if (@ref_id3 eq 1) + { + foreach my $ref_id (@ref_id3) + { + if ($reference_next_seed eq "yes") + { + my $next_seed_loc = $ref_id + $ref_loc - $p + 100; + if (exists($hashref2{$next_seed_loc})) + { + $next_seed_ref = $hashref2{$next_seed_loc}.$hashref2{$next_seed_loc+30}.$hashref2{$next_seed_loc+60}.$hashref2{$next_seed_loc+90}.$hashref2{$next_seed_loc+120}.$hashref2{$next_seed_loc+150}.$hashref2{$next_seed_loc+180}.$hashref2{$next_seed_loc+210}.$hashref2{$next_seed_loc+240}.$hashref2{$next_seed_loc+270}; + $SNR_next_seed = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 $next_seed_ref." 
NEXT_SEED_REF\n"; + } + goto FINISH; + } + } + $last_ref_pos_forward{$id} = $ref_id; + my $prev_loc1 = $ref_id + $ref_loc; + my $prev_loc_star = $ref_id + $ref_loc; + my @delete; + if (exists($last_ref_seq_forward{$id}) && $first_last_seq_ref ne "yes") + { + } + else + { + foreach my $var_pos_tmp (keys %variance_forward) + { + my @split = split /\+/, $var_pos_tmp; + + my $var_pos = $split[1]; + if ($split[0] eq $id) + { + if ($var_pos > $ref_id) + { + $prev_loc1 -= $variance_forward{$var_pos_tmp}; + $prev_loc_star -= $variance_forward{$var_pos_tmp}; + } + if ($ref_id + $ref_loc - $check_back_length > $var_pos) + { + push @delete, $var_pos_tmp; + } + } + } + } + foreach my $delete (@delete) + { + delete $variance_forward{$delete}; + } + + my $test_star = substr $read_short_end2_tmp, $p+29; + my $star2b = '0'; + if ($containX_short_end2 > 0) + { + $star2b = $test_star =~ tr/\*/\*/; + } + if ($star2b > 0) + { + $prev_loc1 -= $star2b; + $prev_loc_star -= $star2b*2; + } + + $last_seq_ref = $hashref2{$prev_loc1-30}; + print OUTPUT5 $last_seq_ref." LAST_SEQ_REF\n"; + if (exists($hashref2{$prev_loc1})) + {} + else + { + my $m = '0'; + while (exists($hashref2{$prev_loc1-$ref_loc+$m})) + { + $m++; + } + $prev_loc1 = $ref_loc-$m; + $last_ref_pos_forward{$id} = $prev_loc1; + } + if (exists($hashref2{$prev_loc1})) + { + my $ref_check; + my $j = '0'; + my $e = '1'; + my $prev_loc1_tmp = $prev_loc1; + + my $ref_check_star; + if ($star2 > 0) + { + if (exists($hashref2{$prev_loc_star})) + { + $prev_loc1_tmp = $prev_loc_star; + print OUTPUT5 $prev_loc_star." 
EXISTSREF_star\n"; + } + } + + while ($j < 130) + { + if (exists($hashref2{$prev_loc1_tmp+$j})) + { + $ref_check .= $hashref2{$prev_loc1_tmp+$j}; + } + elsif ($heteroplasmy ne "" && (length($ref_check) < length($best_extension) || ($split ne "" && (length($ref_check) < length($best_extension1) || length($ref_check) < length($best_extension2))))) + { + my $best_extension_tmp = substr $best_extension, 0, length($ref_check); + if ($split ne "") + { + $best_extension1 = substr $best_extension1, 0, length($ref_check); + $best_extension2 = substr $best_extension2, 0, length($ref_check); + } + $best_extension = $best_extension_tmp; + $last_150 = length($read); + $no_next_seed = "yes"; + print OUTPUT5 "END_REF\n"; + } + $j += 30; + } + $ref_check =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + if ($y > $startprint2) + { + print OUTPUT5 $ref_check." EXISTSREF1 ".$prev_loc1_tmp." PREV_LOC1\n"; + } + + my $best_extension1_tmp; + my $best_extension2_tmp; + my $best_extension3_tmp; + my $best_extension4_tmp; + if (length($best_extension_old1) > length($best_extension1)) + { + $best_extension1_tmp = $best_extension_old1; + } + else + { + $best_extension1_tmp = $best_extension1; + } + if (length($best_extension_old2) > length($best_extension2)) + { + $best_extension2_tmp = $best_extension_old2; + } + else + { + $best_extension2_tmp = $best_extension2; + } + if (length($best_extension_old3) > length($best_extension3)) + { + $best_extension3_tmp = $best_extension_old3; + } + else + { + $best_extension3_tmp = $best_extension3; + } + if (length($best_extension_old4) > length($best_extension4)) + { + $best_extension4_tmp = $best_extension_old4; + } + else + { + $best_extension4_tmp = $best_extension4; + } + + my $best_extension1_part = substr $best_extension1_tmp, 0, 25; + my $best_extension2_part = substr $best_extension2_tmp, 0, 25; + my $best_extension3_part = substr $best_extension3_tmp, 0, 25; + my $best_extension4_part = substr $best_extension4_tmp, 0, 25; + my $best_extension1_partb = 
substr $best_extension1_tmp, 5, 25; + my $best_extension2_partb = substr $best_extension2_tmp, 5, 25; + my $best_extension3_partb = substr $best_extension3_tmp, 5, 25; + my $best_extension4_partb = substr $best_extension4_tmp, 5, 25; + + my $reference_guided1 = '0'; + my $reference_guided2 = '0'; + my $reference_guided3 = '0'; + my $reference_guided4 = '0'; + my $best_extension_del = $best_extension; + if (($variance_detection eq "yes" || $heteroplasmy ne "") && $split eq "" && $repetitive_detect eq "") + { + my $del_detect = '1'; +DEL_DETECT: if ($deletion eq "yes" && $del_detect eq '1') + { + my $best_extension1_tmp = $best_extension; + $best_extension1_tmp =~ tr/\*//d; + $best_extension = $best_extension1_tmp; + } + elsif ($deletion eq "yes" && $del_detect eq '2') + { + my $best_extension2_tmp = $best_extension_del; + $best_extension2_tmp =~ s/.\*//g; + $best_extension = $best_extension2_tmp; + } + if ($y > $startprint2) + { + print OUTPUT5 $best_extension." BEST_EXTENSION_VAR_DETECT\n\n"; + } + + my $ref_check_tmp = $ref_check; + + if (exists($large_variance_forward{$id})) + { + $ref_check_tmp = $hashref2{$large_variance_forward{$id}}.$hashref2{$large_variance_forward{$id}+30}.$hashref2{$large_variance_forward{$id}+60}.$hashref2{$large_variance_forward{$id}+90}.$hashref2{$large_variance_forward{$id}+120}; + print OUTPUT5 $ref_check_tmp." 
EXISTSREF1_VAR_DETECT\n\n"; + } + + my $best_extension_tmp = $best_extension; + my $deletion_found = ""; + my $save_seq_ref; + my $max_SNP = 1; + + if (exists($large_variance_forward{$id}) && length($best_extension) < 13) + { + $best_extension_tmp = $read_end.$best_extension; + } +VAR_START: + my @line = split //, $best_extension_tmp; + my @ref_check = split //, $ref_check_tmp; + my $gh = '0'; + my $th = '0'; + my $var_SNP_detect; + +VAR_CHECK: while ($gh < length($ref_check_tmp)-length($best_extension_tmp) && $SNR_read eq "") + { + my $d = '0'; + my $next = '0'; + my @pos; + my $pos; + my $pos_first = ""; + my $v = '4'; + my $AF; + my $DP; + if (length($best_extension_tmp) > 15) + { + $v = '6'; + } + + while ($d < length($best_extension_tmp)) + { + $th = $d + $gh; + if ($line[$d] eq $ref_check[$th]) + { + if ($next ne '0' && (($pos-$prev_loc1-$gh) < $d-$v || $d eq length($best_extension_tmp)-1)) + { + foreach my $pos_tmp (@pos) + { + my $position_tmp; + my $nuc_other_than_ref = ""; + if ($AF eq "") + { + $position_tmp = $pos_first; + $pos_first = $pos_tmp; + } + else + { + $position_tmp = $pos_tmp-$prev_loc1; + } + + my @nucs_count = split /\+/, $allele_percentage{$position_tmp}; + my ($nucs_alt2, $nucs_alt_array) = IUPAC_reverse($line[$pos_tmp-$prev_loc1-$gh]); + + my @nucs_alt = @$nucs_alt_array; + my $nucs_alt; + my $no_variance = ""; + my $deduct_duplications_first = '0'; + my $deduct_low_quality_next_nuc_first = '0'; + my $deduct_mismatch_nuc_first = '0'; + + print OUTPUT5 $allele_percentage{$position_tmp}." TESh2\n"; + my $s = '0'; + my @nucs_order = ('A','C','T','G'); + my $AF_high = '0'; + my %allele_ordered; + if (($nucs_count[0]+$nucs_count[1]+$nucs_count[2]+$nucs_count[3]) > 0) + { + my $h = '0.001'; + foreach (@nucs_alt) + { + my $nuc_tmp = $_-1; + my $AF_tmp = sprintf("%.3g",$nucs_count[$nuc_tmp]/($nucs_count[0]+$nucs_count[1]+$nucs_count[2]+$nucs_count[3])); + print OUTPUT5 $nucs_order[$nuc_tmp]." 
TESh3\n"; + if ($nucs_order[$nuc_tmp] ne $ref_check[$pos_tmp-$prev_loc1] && @nucs_alt < 3) + { + $nuc_other_than_ref = $nucs_order[$nuc_tmp]; + } + if (exists($allele_ordered{$AF_tmp})) + { + $allele_ordered{$AF_tmp+0.001} = $nuc_tmp; + $h = '0.002'; + } + else + { + $allele_ordered{$AF_tmp} = $nuc_tmp; + } + print OUTPUT5 $AF_tmp." TESh4\n"; + } + my %deduct_duplications; + my $count_allele = '1'; + my $deduct_duplications_total; + foreach my $allele_ordered (sort {$b <=> $a} keys %allele_ordered) + { + if ($s eq '0') + { + $nucs_alt = $nucs_order[$allele_ordered{$allele_ordered}]; + + my %count_ext_first; +HP_NEXT: foreach my $ext_tmp (keys %extensionsb) + { + my $first_nuc = substr $extensionsb{$ext_tmp}, $position_tmp, 1; + my $match_ext; + my $id_match_b = $ext_tmp; + my $id_match_end = substr $id_match_b, -1, 1,"",; + + if (exists($remove_extension_mismatch{$extensionsb{$ext_tmp}})) + { + if ($position_tmp >= $remove_extension_mismatch{$extensionsb{$ext_tmp}}) + { + $deduct_mismatch_nuc_first++; + next HP_NEXT; + } + } + + + if (exists($hash{$id_match_b})) + { + my @id_match_b = split /,/, $hash{$id_match_b}; + + if ($id_match_end eq "1") + { + $match_ext = $id_match_b[0]; + } + elsif ($id_match_end eq "2") + { + $match_ext = $id_match_b[1]; + } + if ($use_quality ne "") + { + my $countN = $match_ext =~ tr/1|2|3|4/N/; + } + if (exists($count_ext_first{length($extensionsb{$ext_tmp})})) + { + my $count = $count_ext_first{length($extensionsb{$ext_tmp})}+1; + $count_ext_first{length($extensionsb{$ext_tmp})} = $count; + } + else + { + $count_ext_first{length($extensionsb{$ext_tmp})} = '0'; + } + } + } + foreach my $count_ext_first (keys %count_ext_first) + { + if ($count_ext_first{$count_ext_first} > ($nucs_count[$allele_ordered{$allele_ordered}]/($read_length-$left-$overlap))*3) + { + $deduct_duplications_first += $count_ext_first{$count_ext_first}-($nucs_count[$allele_ordered{$allele_ordered}]/($read_length-$left-$overlap)); + } + } + $deduct_duplications{0} = 
$nucs_count[$allele_ordered{$allele_ordered}]-$deduct_duplications_first-$deduct_low_quality_next_nuc_first-$deduct_mismatch_nuc_first; + $deduct_duplications_total = $deduct_duplications_first+$deduct_low_quality_next_nuc_first+$deduct_mismatch_nuc_first; + } + else + { + $nucs_alt .= ",".$nucs_order[$allele_ordered{$allele_ordered}]; + print OUTPUT5 $allele_ordered." ALLELE\n"; + my %count_ext; + my $deduct_low_quality_next_nuc = '0'; + my $deduct_mismatch_nuc = '0'; + +HP_NEXT2: foreach my $ext_tmp (keys %extensionsb) + { + my $first_nuc = substr $extensionsb{$ext_tmp}, $position_tmp, 1; + my $match_ext; + my $match_ext_pair; + if ($first_nuc eq $nucs_order[$allele_ordered{$allele_ordered}]) + { + my $id_match_b = $ext_tmp; + my $id_match_end = substr $id_match_b, -1, 1,""; + + if (exists($remove_extension_mismatch{$extensionsb{$ext_tmp}})) + { + if ($position_tmp >= $remove_extension_mismatch{$extensionsb{$ext_tmp}}) + { + $deduct_mismatch_nuc++; + next HP_NEXT2; + } + } + + if (exists($hash{$id_match_b})) + { + my @id_match_b = split /,/, $hash{$id_match_b}; + + if ($id_match_end eq "1") + { + $match_ext = $id_match_b[0]; + $match_ext_pair = $id_match_b[1]; + } + elsif ($id_match_end eq "2") + { + $match_ext = $id_match_b[1]; + $match_ext_pair = $id_match_b[0]; + } + + $match_ext =~ tr/ACTG/TGAC/; + my $match_ext_reverse = reverse($match_ext); + + if ($use_quality ne "") + { + $match_ext_reverse =~ tr/1|2|3|4/N/, + } + + if (exists($count_ext{length($extensionsb{$ext_tmp})})) + { + my $count = $count_ext{length($extensionsb{$ext_tmp})}+1; + $count_ext{length($extensionsb{$ext_tmp})} = $count; + } + else + { + $count_ext{length($extensionsb{$ext_tmp})} = '0'; + } + } + } + } + my $deduct_duplications = '0'; + foreach my $count_ext (keys %count_ext) + { + if ($count_ext{$count_ext} > ($nucs_count[$allele_ordered{$allele_ordered}]/($read_length-$left-$overlap))*3) + { + $deduct_duplications += 
$count_ext{$count_ext}-$nucs_count[$allele_ordered{$allele_ordered}]/($read_length-$left-$overlap); + } + } + $deduct_duplications{$count_allele} = $nucs_count[$allele_ordered{$allele_ordered}]-$deduct_duplications-$deduct_low_quality_next_nuc-$deduct_mismatch_nuc; + $deduct_duplications_total += ($deduct_duplications+$deduct_low_quality_next_nuc+$deduct_mismatch_nuc); + $count_allele++; + } + $s++; + } + my $new_total_nuc = $nucs_count[0]+$nucs_count[1]+$nucs_count[2]+$nucs_count[3]-$deduct_duplications_total; + $no_variance = "yes"; + foreach my $deduct_duplications_tmp (sort {$a <=> $b} keys %deduct_duplications) + { + my $AF_tmp = sprintf("%.3g",($deduct_duplications{$deduct_duplications_tmp})/$new_total_nuc); + print OUTPUT5 $AF_tmp." NO_VARIANCE\n"; + if ($deduct_duplications_tmp eq '0') + { + $AF = $AF_tmp; + } + elsif ($AF_tmp >= $heteroplasmy) + { + $AF .= ",".$AF_tmp; + $no_variance = ""; + } + } + } + $DP = $allele_total{$position_tmp}; + + my $nuc_in_ext_hp = substr $best_extension, $pos_tmp-$prev_loc1, 1; + if ($hp_seed_assemble eq "" && $no_variance ne "yes") + { + if (exists($variance_all{$pos_tmp})) + { + my $one; + my $two; + if ($variance_all{$pos_tmp} =~ m/^\S*\s\S*\s\S*\t(\S*)\s(\S*)\s.*/) + { + $one = substr $1, 1; + $two = substr $2, 1; + } + $variance_all_SNP{$pos_tmp} = $nucs_alt; + $variance_all{$pos_tmp."b"} = $chromosome."\t".$pos_tmp."\t".".\t".$ref_check[$pos_tmp-$prev_loc1]."\t".$nucs_alt."\t.\t.\tAF=".$AF.";DP=".$DP; + $variance_all{$pos_tmp} = $chromosome."\t".$pos_tmp."\t".".\t".$ref_check[$pos_tmp-$prev_loc1].$one."\t".$nucs_alt.$two."\t.\t.\tAF=".$AF.";DP=".$DP; + } + else + { + $variance_all_SNP{$pos_tmp} = $nucs_alt; + $variance_all{$pos_tmp} = $chromosome."\t".$pos_tmp."\t".".\t".$ref_check[$pos_tmp-$prev_loc1]."\t".$nucs_alt."\t.\t.\tAF=".$AF.";DP=".$DP; + } + } + elsif ($ref_check[$pos_tmp-$prev_loc1] ne $nuc_in_ext_hp && ($nuc_in_ext_hp eq "A" || $nuc_in_ext_hp eq "C" || $nuc_in_ext_hp eq "T" || $nuc_in_ext_hp eq "G")) + { + 
if (exists($linked_half_SNPs_exclude{$pos_tmp})) + { + } + else + { + my $pos_SNPs = $position+$pos_tmp-$prev_loc1+1; + $SNPs{$pos_SNPs} = $nuc_in_ext_hp; + } + $linked_SNPs{$pos_tmp} = undef; + } + elsif ($ref_check[$pos_tmp-$prev_loc1] ne $nuc_in_ext_hp && $no_variance ne "yes") + { + my $test_c = '0'; + foreach my $l_h_SNPs (keys %linked_half_SNPs) + { + $test_c++; + } + if ($test_c > 0) + { + $noforward{$id} = "stop_HP"; + print OUTPUT5 "STOP_HALF_LINKED\n";; + } + $linked_half_SNPs{$pos_tmp} = undef; + if ($PCR_free ne "yes" && $test_c eq '0') + { + my $pos_SNPs = $position+$pos_tmp-$prev_loc1+1; + if ($nuc_other_than_ref ne "") + { + my $best_extension_tmp = $best_extension; + substr $best_extension_tmp, $pos_tmp-$prev_loc1, 1, $ref_check[$pos_tmp-$prev_loc1]; + my $all_linked_hps; + foreach my $SNPs (keys %SNPs) + { + $all_linked_hps .= "-".$SNPs+$position_back; + } + my $all_linked_hps_pos; + foreach my $linked_SNPs (keys %linked_SNPs) + { + $all_linked_hps_pos .= "-".$linked_SNPs; + } + $linked_half_SNPs2{$read.$best_extension_tmp} .= exists $linked_half_SNPs2{$read.$best_extension_tmp} ? 
",".$id."+".$all_linked_hps."+".$all_linked_hps_pos : $id."+".$all_linked_hps."+".$all_linked_hps_pos; + substr $best_extension, $pos_tmp-$prev_loc1, 1, $nuc_other_than_ref; + + $SNPs{$pos_SNPs} = $nuc_other_than_ref; + $linked_SNPs{$pos_tmp} = undef; + } + } + if ($nuc_in_ext_hp ne "A" && $nuc_in_ext_hp ne "C" && $nuc_in_ext_hp ne "T" && $nuc_in_ext_hp ne "G" && $first_linked_half_SNP_pos eq "dQFQF") + { + foreach my $SNP_tmp (keys %allele_ordered) + { + if ($allele_ordered{$SNP_tmp} ne $ref_check[$pos_tmp-$prev_loc1]) + { + my $best_ext_tmp = $best_extension; + substr $best_ext_tmp, $pos_tmp-$prev_loc1, 1, $allele_ordered{$SNP_tmp}; + my @best_ext_tmp = split //, $best_ext_tmp; + my $stop = '0'; + my $l = '0'; + foreach my $nuc_tmp (@best_ext_tmp) + { + $l++; + if ($nuc_tmp ne "A" && $nuc_tmp ne "C" && $nuc_tmp ne "T" && $nuc_tmp ne "G" && $stop eq '0') + { + $stop = $l; + } + } + if ($stop ne '0') + { + my $end = $stop-1; + my $best_ext_tmp_tmp = $best_ext_tmp; + $best_ext_tmp = substr $best_ext_tmp_tmp, 0, $end; + } + $first_linked_half_SNP_read = $read.$best_ext_tmp; + $first_linked_half_SNP_pos = length($best_ext_tmp); + } + } + } + } + print OUTPUT5 $pos_tmp." POS_TMP\n"; + } + $next = '0'; + $var_SNP_detect = "yes"; + undef @pos; + } + elsif ($hp_seed_assemble ne "") + { + foreach my $pos_tmp (keys %variance_all_SNP) + { + if ($pos_tmp eq $prev_loc1_tmp+$d) + { + $not_linked_SNPs{$pos_tmp} = undef; + print OUTPUT5 $pos_tmp." NOT_LINKED\n"; + } + } + } + } + elsif ($ref_check[$th] eq ".") + { + } + elsif ($line[$d] eq ".") + { + } + elsif ($next < $max_SNP || $deletion_found eq "yes") + { + if ($y > $startprint2) + { + print OUTPUT5 $ref_check[$th]." 
VAR_DETECT\n"; + } + + $pos = $prev_loc1+$th; + $next++; + my $count_pos_tmp = @pos; + + push @pos, $prev_loc1+$th; + if ($pos_first eq "") + { + $pos_first = $d; + } + if ($d > length($best_extension_tmp)-6 && length($best_extension_tmp) > 6 && $count_pos_tmp eq '0') + { + $best_extension = substr $best_extension, 0, -5; + last VAR_CHECK; + } + if ($d eq '0') + { + $max_SNP = length($best_extension)/4; + if ($max_SNP < 3) + { + $max_SNP = 3; + } + if (length($best_extension) > 15 && $max_SNP < 6) + { + $max_SNP = 6; + } + if (length($best_extension) > 25 && $max_SNP < 7) + { + $max_SNP = 7; + } + } + } + elsif ($next > 0 && $pos_first ne '0') + { + $best_extension = substr $best_extension, 0, $pos_first; + $save_seq_ref = "no"; + last VAR_CHECK; + } + elsif ($next >= $max_SNP && $pos_first eq '0' && $deletion_found ne "yes") + { + my @check_deletion = check_deletion($best_extension_tmp, $ref_check_tmp,"","","yes", ""); + my $var_deletion = $check_deletion[0]; + my $one_or_two = $check_deletion[1]; + my $shorter = $check_deletion[2]; + print OUTPUT5 $var_deletion." 
VAR_DEL_DETECT\n"; + $save_seq_ref = "no"; + + if ($var_deletion =~ m/(.*)\*(.*)?/) + { + my $deletion = $1; + my $after_deletion = $2; + $deletion =~ tr/\*//d; + my $last_nuc = substr $read_end, -1; + my $loc_in_ref = $prev_loc1-$gh; + my $deletion_length0; + if (exists($large_variance_forward{$id})) + { + my $deletion_length = $prev_loc1-$large_variance_forward{$id}; + my $deletion_length_tmp = length($deletion); + + my $deleted_part = substr $read_short_end2, -$deletion_length; + $loc_in_ref = $large_variance_forward{$id}; + + if (length($best_extension) < 13) + { + $deletion_length_tmp = $overlap - length($deletion); + } + if ($one_or_two eq "one") + { + $deletion_length += $deletion_length_tmp; + $deleted_part .= $deletion; + $loc_in_ref -= 1; + } + else + { + $deletion_length -= $deletion_length_tmp; + substr $deleted_part, -$deletion_length_tmp, $deletion_length_tmp, ""; + } + + if (length($best_extension) < 13) + { + if ($overlap > length($deletion)) + { + $deleted_part = substr $read_short_end2, -($deletion_length-$deletion_length_tmp), -$deletion_length_tmp; + } + } + print OUTPUT5 $deletion_length." DELETION_LENGTH\n"; + print OUTPUT5 $deletion_length_tmp." DELETION_LENGTH2\n"; + + $deletion = $deleted_part; + $last_nuc = substr $hashref2{$large_variance_forward{$id}-1}, 0, 1; + + $deletion_length0 = $deletion_length; + print OUTPUT5 $deletion_length." DELETION_LENGTHb\n"; + print OUTPUT5 $large_variance_length_forward{$id}." DELETION_LENGTH2b\n"; + if ($large_variance_length_forward{$id} > $deletion_length_tmp) + { + if ($one_or_two eq "two") + { + $loc_in_ref -= 1; + } + $one_or_two = "one"; + } + elsif ($large_variance_length_forward{$id} < $deletion_length_tmp) + { + $one_or_two = "two"; + } + print OUTPUT5 $deleted_part." 
LARGE_DELETION\n"; + } + + my $tmp = '0'; + my $check_rep = substr $hashref2{$loc_in_ref-60}.$hashref2{$loc_in_ref-30}, -length($deletion)-$tmp, length($deletion); + while ($check_rep eq $deletion && $deletion ne "" && $check_rep ne "") + { + $tmp++; + $check_rep = substr $hashref2{$loc_in_ref-60}.$hashref2{$loc_in_ref-30}, -length($deletion)-$tmp, length($deletion); + } + if ($tmp ne '0') + { + $tmp += length($deletion); + } + my $pos_tmp = $loc_in_ref-1; + my $last_nucb = $last_nuc; + my $tmpie = $loc_in_ref-$tmp; + if ($hp_seed_assemble eq "") + { + if (exists($variance_all{$pos_tmp})) + { + if ($variance_all{$pos_tmp} =~ m/^\S*\s\S*\s\S*\s(\S*)\s(\S*)\s.*/) + { + $last_nuc = $1; + $last_nucb = $2; + print OUTPUT5 $variance_all{$pos_tmp}." ALLb\n"; + print OUTPUT5 $last_nuc." ONEb\n"; + print OUTPUT5 $last_nucb." TWOb\n"; + } + $variance_all{$pos_tmp."b"} = $variance_all{$pos_tmp}; + delete $variance_all{$pos_tmp}; + } + if ($one_or_two eq "one") + { + $variance_all{$pos_tmp} = $chromosome."\t".$pos_tmp."\t".".\t".$last_nuc."\t".$last_nucb.$deletion."\t.\t.\t."; + $variance_forward{$id."+".$tmpie} = length($deletion); + } + elsif($one_or_two eq "two") + { + $variance_all{$pos_tmp} = $chromosome."\t".$pos_tmp."\t".".\t".$last_nuc.$deletion."\t".$last_nucb."\t.\t.\t."; + $variance_forward{$id."+".$tmpie} = -length($deletion); + } + } + my $check_dot = $after_deletion =~ tr/\./\./; + if ($shorter > 0) + { + substr $best_extension, -$shorter, $shorter, ""; + print OUTPUT5 $best_extension." 
BEST_EXTENSION_SHORTER\n"; + } + + if ($deletion_length0 < $large_variance_length_forward{$id} && exists($large_variance_forward{$id})) + { + if ($one_or_two eq "one") + { + $best_extension_tmp = substr $read_short_end2, -$large_variance_length_forward{$id}+$deletion_length0; + $best_extension_tmp .= $best_extension; + $prev_loc1 = $prev_loc1-($prev_loc1-$large_variance_forward{$id}); + } + elsif($one_or_two eq "two") + { + $ref_check_tmp = substr $ref_check, length($deletion); + } + $deletion_found = "yes"; + delete $large_variance_forward{$id}; + delete $large_variance_length_forward{$id}; + goto VAR_START; + } + + delete $large_variance_forward{$id}; + delete $large_variance_length_forward{$id}; + if ($check_dot > 0) + { + if ($one_or_two eq "one") + { + $best_extension_tmp = substr $best_extension, length($deletion); + } + elsif($one_or_two eq "two") + { + $ref_check_tmp = substr $ref_check, length($deletion); + } + $deletion_found = "yes"; + goto VAR_START; + } + } + if ($var_deletion ne "") + { + last VAR_CHECK; + } + elsif ($max_SNP eq '1') + { + $max_SNP = length($best_extension)/4; + if ($max_SNP < 3) + { + $max_SNP = 3; + } + if (length($best_extension) > 15 && $max_SNP < 6) + { + $max_SNP = 6; + } + if (length($best_extension) > 25 && $max_SNP < 7) + { + $max_SNP = 7; + } + undef @pos; + goto VAR_START; + } + else + { + if (exists($large_variance_forward{$id})) + {} + else + { + print OUTPUT5 $prev_loc1-$gh."\tLARGE_VARIANCE\n"; + $large_variance_forward{$id} = $prev_loc1; + } + last VAR_CHECK; + } + } + else + { + $gh++; + last VAR_CHECK; + } + $d++ + } + last VAR_CHECK; + } + + if ($best_extension ne "") + { + $indel_split = '0'; + delete $indel_split{$id}; + } + foreach my $var_pos_tmp (keys %variance_forward) + { + my @split = split /\+/, $var_pos_tmp; + my $var_pos = $split[1]; + if ($split[0] eq $id) + { + if ($ref_id + $ref_loc - (length($last_seq_ref.$best_extension)) > $var_pos) + { + $save_seq_ref = "no"; + } + } + } + if ($save_seq_ref ne 
"no") + { + $last_ref_seq_forward{$id} = $last_seq_ref; + $last_pos_seq_forward{$id} = $prev_loc1_tmp; + } + else + { + delete $last_ref_seq_forward{$id}; + } + if ($deletion eq "yes" && $del_detect eq '1') + { + $del_detect = '2'; + goto DEL_DETECT; + } + if ($deletion eq "yes") + { + $best_extension = $best_extension_del; + } + $SNR_test = ""; + $best_extension_prev{$id} = $best_extension; + goto AFTER_EXT; + } + else + { + delete $last_ref_seq_forward{$id}; + } + + if (length($best_extension1_part) > 5) + { + my $ref_check_tmp = $ref_check; + my $best_extension1_part_tmp = $best_extension1_part; + my $steps = '0'; + my $cut_ext = '0'; +EXT1_PART0: $best_extension1_part_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my @line = split //,$best_extension1_part_tmp; + my @ref_check = split //, $ref_check_tmp; + my $gh = '0'; + my $th = '0'; + +EXT1_PART: while ($gh < length($ref_check_tmp)-length($best_extension1_part_tmp)) + { + my $d = '0'; + my $next = '0'; + + while ($d < length($best_extension1_part_tmp)) + { + $th = $d + $gh; + if ($line[$d] eq $ref_check[$th]) + { + } + elsif ($ref_check[$th] eq ".") + { + } + elsif ($line[$d] eq ".") + { + } + elsif ($next < length($best_extension1_part_tmp)*0.25) + { + $next++; + } + else + { + $gh++; + goto EXT1_PART; + } + $d++ + } + + $reference_guided1 += $next+($gh/2)+1+$cut_ext; + + if ($reference_guided1 < 25) + { + $reference_guided = "yes1"; + print OUTPUT5 $reference_guided1." REFERENCE_GUIDED1\n"; + print OUTPUT5 $best_extension1_part_tmp." 
BEST_EXTENSION1\n\n"; + $best_extension = $best_extension1_tmp; + last EXT1_PART; + } + else + { + $reference_guided1 = '0'; + $gh++; + goto EXT1_PART; + } + } + if ($reference_guided eq "" && $star2 > 0 && ($steps < 1 || ($steps < 2 && $best_extension1_part_tmp eq $best_extension1_partb))) + { + $ref_check_tmp = $ref_check_star; + $steps++; + goto EXT1_PART0; + } + if ($reference_guided eq "" && $cut_ext < 5 && length($best_extension1_tmp)-$cut_ext > 10) + { + $ref_check_tmp = $ref_check; + $cut_ext++; + $best_extension1_part_tmp = substr $best_extension1_tmp, $cut_ext, 25; + if ($cut_ext eq 5) + { + $steps += 2; + if ($star2 > 0) + { + $steps--; + } + } + goto EXT1_PART0; + } + } + if ($reference_guided ne "yes1") + { +EXT1_PART_single: foreach my $extensions_group1 (@extensions_group1_old) + { + if (length($extensions_group1) > 15) + { + my $best_extension1_tmp_6 = substr $best_extension1_tmp, 0, 6; + if ($extensions_group1 =~ s/$best_extension1_tmp_6/$best_extension1_tmp_6/) + { + my $extensions_group1_part = substr $extensions_group1, 0, 25; + my @ref1_single = build_partialb_4dots $extensions_group1_part; + foreach my $best_extension1_part2_single (@ref1_single) + { + my $ref_check_tmp = $ref_check; + my $ref_check_tmp2 = $ref_check_star; + if ($ref_check_tmp =~ s/$best_extension1_part2_single/$best_extension1_part2_single/ || $ref_check_tmp2 =~ s/$best_extension1_part2_single/$best_extension1_part2_single/) + { + print OUTPUT5 "REFERENCE_GUIDED1b\n"; + print OUTPUT5 $extensions_group1." 
BEST_EXTENSION1_single\n\n"; + $best_extension = $best_extension1_tmp; + $reference_guided = "yes1"; + last EXT1_PART_single; + } + } + undef @ref1_single; + } + } + } + } + if (length($best_extension2_part) > 5) + { + my $ref_check_tmp = $ref_check; + my $best_extension2_part_tmp = $best_extension2_part; + my $steps = '0'; + my $cut_ext = '0'; +EXT2_PART0: $best_extension2_part_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my @line = split //,$best_extension2_part_tmp; + my @ref_check = split //, $ref_check_tmp; + my $gh = '0'; + my $th = '0'; + +EXT2_PART: while ($gh < length($ref_check_tmp)-length($best_extension2_part_tmp)) + { + my $d = '0'; + my $next = '0'; + + while ($d < length($best_extension2_part_tmp)) + { + $th = $d + $gh; + if ($line[$d] eq $ref_check[$th]) + { + } + elsif ($ref_check[$th] eq ".") + { + } + elsif ($line[$d] eq ".") + { + } + elsif ($next < length($best_extension2_part_tmp)*0.25) + { + $next++; + } + else + { + $gh++; + goto EXT2_PART; + } + $d++ + } + $reference_guided2 += $next+($gh/2)+1+$cut_ext; + + if ($reference_guided2 > 24) + { + $reference_guided2 = '0'; + $gh++; + goto EXT2_PART; + } + + if ($reference_guided ne "" && $reference_guided2 > 7*$reference_guided1) + { + last EXT2_PART; + } + elsif ($reference_guided ne "" && $reference_guided1 > 7*$reference_guided2) + { + $reference_guided = ""; + } + if ($reference_guided eq "") + { + $reference_guided = "yes2"; + } + else + { + $reference_guided = "yes_both"; + } + + print OUTPUT5 $reference_guided2." REFERENCE_GUIDED2\n"; + print OUTPUT5 $best_extension2_part_tmp." 
BEST_EXTENSION2\n\n"; + $best_extension = $best_extension2_tmp; + last EXT2_PART; + } + if ($reference_guided2 eq 0 && $star2 > 0 && ($steps < 1 || ($steps < 2 && $best_extension2_part_tmp eq $best_extension2_partb))) + { + $ref_check_tmp = $ref_check_star; + $steps++; + goto EXT2_PART0; + } + if ($reference_guided2 eq 0 && $cut_ext < 5 && length($best_extension2_tmp)-$cut_ext > 10) + { + $ref_check_tmp = $ref_check; + $cut_ext++; + $best_extension2_part_tmp = substr $best_extension2_tmp, $cut_ext, 25; + if ($cut_ext eq 5) + { + $steps += 2; + if ($star2 > 0) + { + $steps--; + } + } + goto EXT2_PART0; + } + } + if ($reference_guided ne "yes2" && $reference_guided ne "yes_both") + { +EXT2_PART_single: foreach my $extensions_group2 (@extensions_group2_old) + { + if (length($extensions_group2) > 15) + { + my $best_extension2_tmp_6 = substr $best_extension2_tmp, 0, 6; + if ($extensions_group2 =~ s/$best_extension2_tmp_6/$best_extension2_tmp_6/) + { + my $extensions_group2_part = substr $extensions_group2, 0, 25; + my @ref2_single = build_partialb_4dots $extensions_group2_part; + foreach my $best_extension2_part2_single (@ref2_single) + { + my $ref_check_tmp = $ref_check; + my $ref_check_tmp2 = $ref_check_star; + if ($ref_check_tmp =~ s/$best_extension2_part2_single/$best_extension2_part2_single/ || $ref_check_tmp2 =~ s/$best_extension2_part2_single/$best_extension2_part2_single/) + { + if ($reference_guided eq "") + { + $reference_guided = "yes2"; + } + else + { + $reference_guided = "yes_both"; + } + print OUTPUT5 "REFERENCE_GUIDED2c\n"; + print OUTPUT5 $extensions_group2." 
BEST_EXTENSION2_single\n\n"; + $best_extension = $best_extension2_tmp; + last EXT2_PART_single; + } + } + undef @ref2_single; + } + } + } + } + if ($count_split > 2) + { + if (length($best_extension3_part) > 5) + { + my $ref_check_tmp = $ref_check; + my $best_extension3_part_tmp = $best_extension3_part; + my $steps = '0'; + my $cut_ext = '0'; + EXT3_PART0: $best_extension3_part_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my @line = split //,$best_extension3_part_tmp; + my @ref_check = split //, $ref_check_tmp; + my $gh = '0'; + my $th = '0'; + + EXT3_PART: while ($gh < length($ref_check_tmp)-length($best_extension3_part_tmp)) + { + my $d = '0'; + my $next = '0'; + + while ($d < length($best_extension3_part_tmp)) + { + $th = $d + $gh; + if ($line[$d] eq $ref_check[$th]) + { + } + elsif ($ref_check[$th] eq ".") + { + } + elsif ($line[$d] eq ".") + { + } + elsif ($next < length($best_extension3_part_tmp)*0.25) + { + $next++; + } + else + { + $gh++; + goto EXT3_PART; + } + $d++ + } + $reference_guided3 += $next+($gh/2)+1+$cut_ext; + + if ($reference_guided3 > 24) + { + $reference_guided3 = '0'; + $gh++; + goto EXT3_PART; + } + + if ($reference_guided ne "" && $reference_guided3 > 7*$reference_guided1 && $reference_guided3 > 7*$reference_guided2) + { + last EXT3_PART; + } + elsif ($reference_guided eq "yes1" && $reference_guided1 > 7*$reference_guided3) + { + $reference_guided = ""; + } + elsif ($reference_guided eq "yes2" && $reference_guided2 > 7*$reference_guided3) + { + $reference_guided = ""; + } + if ($reference_guided eq "") + { + $reference_guided = "yes3"; + } + else + { + $reference_guided = "yes_both"; + } + + print OUTPUT5 $reference_guided3." REFERENCE_GUIDED3\n"; + print OUTPUT5 $best_extension3_part_tmp." 
BEST_EXTENSION3\n\n"; + $best_extension = $best_extension3_tmp; + last EXT3_PART; + } + if ($reference_guided3 eq 0 && $star2 > 0 && ($steps < 1 || ($steps < 2 && $best_extension3_part_tmp eq $best_extension3_partb))) + { + $ref_check_tmp = $ref_check_star; + $steps++; + goto EXT3_PART0; + } + if ($reference_guided3 eq 0 && $cut_ext < 5 && length($best_extension3_tmp)-$cut_ext > 10) + { + $ref_check_tmp = $ref_check; + $cut_ext++; + $best_extension3_part_tmp = substr $best_extension3_tmp, $cut_ext, 25; + if ($cut_ext eq 5) + { + $steps += 2; + if ($star2 > 0) + { + $steps--; + } + } + goto EXT3_PART0; + } + } + if ($reference_guided ne "yes3" && $reference_guided ne "yes_both") + { +EXT3_PART_single: foreach my $extensions_group3 (@extensions_group3_old) + { + if (length($extensions_group3) > 15) + { + my $best_extension3_tmp_6 = substr $best_extension3_tmp, 0, 6; + if ($extensions_group3 =~ s/$best_extension3_tmp_6/$best_extension3_tmp_6/) + { + my $extensions_group3_part = substr $extensions_group3, 0, 25; + my @ref2_single = build_partialb_4dots $extensions_group3_part; + foreach my $best_extension3_part2_single (@ref2_single) + { + my $ref_check_tmp = $ref_check; + my $ref_check_tmp2 = $ref_check_star; + if ($ref_check_tmp =~ s/$best_extension3_part2_single/$best_extension3_part2_single/ || $ref_check_tmp2 =~ s/$best_extension3_part2_single/$best_extension3_part2_single/) + { + if ($reference_guided eq "") + { + $reference_guided = "yes3"; + } + else + { + $reference_guided = "yes_both"; + } + print OUTPUT5 "REFERENCE_GUIDED3c\n"; + print OUTPUT5 $extensions_group3." 
BEST_EXTENSION3_single\n\n"; + $best_extension = $best_extension3_tmp; + last EXT3_PART_single; + } + } + undef @ref2_single; + } + } + } + } + } + if ($count_split > 3) + { + if (length($best_extension4_part) > 10) + { + my $ref_check_tmp = $ref_check; + my $best_extension4_part_tmp = $best_extension4_part; + my $steps = '0'; + my $cut_ext = '0'; +EXT4_PART0: $best_extension4_part_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my @line = split //,$best_extension4_part_tmp; + my @ref_check = split //, $ref_check_tmp; + my $gh = '0'; + my $th = '0'; + +EXT4_PART: while ($gh < length($ref_check_tmp)-length($best_extension4_part_tmp)) + { + my $d = '0'; + my $next = '0'; + + while ($d < length($best_extension4_part_tmp)) + { + $th = $d + $gh; + if ($line[$d] eq $ref_check[$th]) + { + } + elsif ($ref_check[$th] eq ".") + { + } + elsif ($line[$d] eq ".") + { + } + elsif ($next < length($best_extension4_part_tmp)*0.25) + { + $next++; + } + else + { + $gh++; + goto EXT4_PART; + } + $d++ + } + $reference_guided4 += $next+($gh/2)+1+$cut_ext; + + if ($reference_guided4 > 24) + { + $reference_guided4 = '0'; + $gh++; + goto EXT4_PART; + } + + if ($reference_guided ne "" && $reference_guided4 > 7*$reference_guided1 && $reference_guided4 > 7*$reference_guided2) + { + last EXT4_PART; + } + elsif ($reference_guided eq "yes1" && $reference_guided1 > 7*$reference_guided4) + { + $reference_guided = ""; + } + elsif ($reference_guided eq "yes2" && $reference_guided2 > 7*$reference_guided4) + { + $reference_guided = ""; + } + elsif ($reference_guided eq "yes3" && $reference_guided3 > 7*$reference_guided4) + { + $reference_guided = ""; + } + if ($reference_guided eq "") + { + $reference_guided = "yes4"; + } + else + { + $reference_guided = "yes_both"; + } + + print OUTPUT5 $reference_guided4." REFERENCE_GUIDED4\n"; + print OUTPUT5 $best_extension4_part_tmp." 
BEST_EXTENSION4\n\n"; + $best_extension = $best_extension4_tmp; + last EXT4_PART; + } + if ($reference_guided4 eq 0 && $star2 > 0 && ($steps < 1 || ($steps < 2 && $best_extension4_part_tmp eq $best_extension4_partb))) + { + $ref_check_tmp = $ref_check_star; + $steps++; + goto EXT4_PART0; + } + if ($reference_guided4 eq 0 && $cut_ext < 5 && length($best_extension4_tmp)-$cut_ext > 10) + { + $ref_check_tmp = $ref_check; + $cut_ext++; + $best_extension4_part_tmp = substr $best_extension4_tmp, $cut_ext, 25; + if ($cut_ext eq 5) + { + $steps += 2; + if ($star2 > 0) + { + $steps--; + } + } + goto EXT4_PART0; + } + } + if ($reference_guided ne "yes4" && $reference_guided ne "yes_both") + { +EXT4_PART_single: foreach my $extensions_group4 (@extensions_group4_old) + { + if (length($extensions_group4) > 15) + { + my $best_extension4_tmp_6 = substr $best_extension4_tmp, 0, 6; + if ($extensions_group4 =~ s/$best_extension4_tmp_6/$best_extension4_tmp_6/) + { + my $extensions_group4_part = substr $extensions_group4, 0, 25; + my @ref2_single = build_partialb_4dots $extensions_group4_part; + foreach my $best_extension4_part2_single (@ref2_single) + { + my $ref_check_tmp = $ref_check; + my $ref_check_tmp2 = $ref_check_star; + if ($ref_check_tmp =~ s/$best_extension4_part2_single/$best_extension4_part2_single/ || $ref_check_tmp2 =~ s/$best_extension4_part2_single/$best_extension4_part2_single/) + { + if ($reference_guided eq "") + { + $reference_guided = "yes4"; + } + else + { + $reference_guided = "yes_both"; + } + print OUTPUT5 "REFERENCE_GUIDED4c\n"; + print OUTPUT5 $extensions_group4." 
BEST_EXTENSION4_single\n\n"; + $best_extension = $best_extension4_tmp; + last EXT4_PART_single; + } + } + undef @ref2_single; + } + } + } + } + } + if ($reference_guided eq "yes_both") + { + $best_extension = ""; + $reference_guided = ""; + goto INDELa0 + } + elsif ($reference_guided ne "") + { + $reference_guided = "yes"; + delete $seed{$id}; + delete $seed{$id_old}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + goto AFTER_EXT; + } + else + { + $best_extension = ""; + } + } + } + last CHECK_REF; + } + elsif (length($read) > 1100 && @ref_id3 > 1 && $p < $p_prev-15) + { +CHECK_BACK_REF0: $p_prev = $p; + my $length_back = '1000'; + if ($further eq "yes") + { + $length_back = '5000'; + } + elsif ($further eq "yes2") + { + $length_back = '10000'; + } + elsif ($further eq "yes3") + { + $length_back = '20000'; + } + my $read_part_back = substr $read, -$length_back-$p-150, 200; + my @ref_id3_new; + undef @ref_id3_new; + my $ref_part_tmp2; +CHECK_BACK_REF: foreach my $ref_id (@ref_id3) + { + $ref_part_tmp2 = $ref_id3{$ref_id}; + if (exists($hashref2{$ref_id-$length_back})) + { + my $ref_part_back = $hashref2{$ref_id-$length_back}; + print OUTPUT5 $ref_part_back. 
" REF_PART_BACK\n"; + my @ref_part_back = build_partialb_4dots $ref_part_back; + foreach my $ref_part_back_tmp (@ref_part_back) + { + my $found = $read_part_back =~ s/$ref_part_back_tmp/$ref_part_back_tmp/; + if ($found > 0) + { + push @ref_id3_new, $ref_id; + next CHECK_BACK_REF; + } + } + undef @ref_part_back; + } + } + if (@ref_id3_new eq 1) + { + undef @ref_id3; + @ref_id3 = @ref_id3_new; + $found_further_back = "yes"; + $ref_part_prev = $ref_part_tmp2; + } + elsif ($further eq "" && length($read) > 5200-$p) + { + $further = "yes"; + goto CHECK_BACK_REF0; + } + elsif ($further eq "yes" && length($read) > 10200-$p) + { + $further = "yes2"; + goto CHECK_BACK_REF0; + } + elsif ($further eq "yes2" && length($read) > 20200-$p) + { + $further = "yes3"; + goto CHECK_BACK_REF0; + } + else + { + undef @ref_id3; + } + } + $p--; + } + if ($reference_next_seed eq "yes") + { + $noforward = "stop"; + $noforward{$id} = "stop"; + goto FINISH; + } + if (($variance_detection eq "yes" || $heteroplasmy ne "") && $split eq "" && $repetitive_detect eq "") + { + if ($best_extension ne "") + { + $indel_split = '0'; + delete $indel_split{$id}; + } + $SNR_test = ""; + if ($y > $startprint2) + { + print OUTPUT5 $best_extension." 
BEST_EXTENSION\n\n"; + } + $best_extension_prev{$id} = $best_extension; + goto AFTER_EXT; + } + } +INDELa0: + if (((length($best_extension1) > 4 && length($best_extension2) > 4) || (length($best_extension_old1) > 4 && length($best_extension_old2) > 4)) && $cp_input ne "" && $type eq "mito_plant" && $reference_guided eq "" && $deletion eq "") + { + my $p = -35; + if ($y > $startprint2) + { + print OUTPUT5 "CHECK_CHLOROPLAST SEQUENCE\n\n"; + } + while ($p > (-35*2)) + { + my $ref_part2 = substr $read_short_end2, $p, 35; + my %ref_part = build_partial3b $ref_part2, ""; + + foreach my $ref_part (keys %ref_part) + { + if (exists($cp_ref{$ref_part})) + { + my $ref_loc = -$p; + + my $ref_id3 = $cp_ref{$ref_part}; + my $ref_id2 = substr $ref_id3, 1; + my @ref_id3 = split /,/,$ref_id2; + if ($y > $startprint2) + { + print OUTPUT5 $ref_part." EXISTS ".$ref_loc." LOC ".$ref_id3." LOC_REF\n"; + } + + foreach my $ref_id (@ref_id3) + { + my $prev_loc1 = $ref_id + $ref_loc; + + if (exists($cp_ref2{$prev_loc1})) + { + my $ref_check = $cp_ref2{$prev_loc1}; + print OUTPUT5 $ref_check." EXISTSREF1\n"; + + my $best_extension1_tmp; + my $best_extension2_tmp; + if (length($best_extension_old1) > length($best_extension1) && length($best_extension_old2) > length($best_extension2)) + { + $best_extension1_tmp = $best_extension_old1; + $best_extension2_tmp = $best_extension_old2; + } + else + { + $best_extension1_tmp = $best_extension1; + $best_extension2_tmp = $best_extension2; + } + my $best_extension1_part = substr $best_extension1_tmp, 0, 25; + my $best_extension2_part = substr $best_extension2_tmp, 0, 25; + + if (length($best_extension1_part) > 10) + { + my $ref_check_tmp = $ref_check; + if ($ref_check_tmp =~ s/$best_extension1_part/$best_extension1_part/) + { + print OUTPUT5 "REFERNCE_GUIDED_CP\n"; + print OUTPUT5 $best_extension2_tmp." 
BEST_EXTENSION2n\n"; + $best_extension = $best_extension2_tmp; + $reference_guided = "yes"; + } + } + if (length($best_extension2_part) > 10) + { + my $ref_check_tmp = $ref_check; + if ($ref_check_tmp =~ s/$best_extension2_part/$best_extension2_part/) + { + $best_extension = $best_extension1_tmp; + if ($reference_guided eq "") + { + $reference_guided = "yes"; + } + else + { + $reference_guided = "yes2"; + } + print OUTPUT5 $reference_guided." REFERNCE_GUIDED_CP\n"; + print OUTPUT5 $best_extension1_tmp." BEST_EXTENSION1\n\n"; + } + } + if ($reference_guided eq "yes") + { + $reference_guided = "yes"; + goto INDELa; + } + elsif ($reference_guided eq "yes2") + { + $best_extension = ""; + $reference_guided = ""; + goto INDELa + } + else + { + $best_extension = ""; + } + } + } + } + } + $p--; + } + } + +INDELa: + if (($last_chance ne "yes" || length($read) <= $insert_size || $deletion eq "yes") && $before eq "yes" && $indel_split_skip ne "yes" && $ext_total > 15 && ($delete_first ne "yes" || (length($best_extension1) < 5 && length($best_extension2) < 5)) && $indel_split < ($read_length-25 -$overlap)) + { + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE SPLIT\n\n"; + } + } + elsif ((($before eq "yes" && $SNP_active eq "") || $extensions_before eq "yes") && ($delete_first ne "yes" || (length($best_extension1) < 5 && length($best_extension2) < 5 && length($best_extension3) < 5 && length($best_extension4) < 5))) + { + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + $SNP_active{$id_original} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE SPLIT2\n\n"; + } + } + else + { + my $best_extension1_tmpi = $best_extension1; + my $best_extension2_tmpi = $best_extension2; + $best_extension1_tmpi =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//d; + $best_extension2_tmpi =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//d; + if ($y > 
$startprint2) + { + print OUTPUT5 "ELSE\n"; + } + if ($before eq "yes" && $SNP_active ne "") + { + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + } + if ($SNR_read eq "yes" && length($best_extension1_tmpi) < 7 && length($best_extension2_tmpi) < 7) + { + $SNR_next_seed = "yes"; + $noforward{$id} = "stop"; + $noforward = "stop"; + delete $seed{$id_split2}; + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split3}; + if ($y > $startprint2) + { + print OUTPUT5 "SNR_NEXT_SEED\n"; + } + goto BACK; + } + } +INDEL: + $id = $id_original; + $split = ""; + $split_forward = "yes"; + + + if ($jump_rep eq "yes" || $jump_rep_because_stuck eq "yes" || ($extensions_before ne "yes" && $contig_end ne "yes" && $indel_split_skip ne "yes" && ($delete_first eq "" || $delete_second eq "" || $delete_third eq "") && $before ne "yes" && $reference_guided ne "yes")) + { + $count1b_tmp = '0'; + $count2b_tmp = '0'; + $count3b_tmp = '0'; + $count4b_tmp = '0'; + my $before_shorter = ""; + my $overhangb = 1+($read_length/40); + + my $count_one = '0'; + my $count_two = '0'; + my $count_three = '0'; + my $count_four = '0'; + + while (($count_one < 3 || $count_two < 3 || $count_three < 3 || $count_four < 3) && $last_chance ne "yes") + { + $overhangb++; + $count_one = '0'; + $count_two = '0'; + $count_three = '0'; + $count_four = '0'; + + if (@extensions_group1 eq '0') + { + $count_one = '4'; + } + foreach my $one (@extensions_group1) + { + if (length($one) < $overhangb && length($one) > 0) + { + $count_one++; + if (@extensions_group1 < 3) + { + $count_one += 3-@extensions_group1; + } + } + } + if (@extensions_group2 eq '0') + { + $count_two = '4'; + } + foreach my $two (@extensions_group2) + { + if (length($two) < $overhangb && length($two) > 0) + { + $count_two++; + if (@extensions_group2 < 3) + { + $count_two += 3-@extensions_group2; + } + } + } + if ($count_split > 2) + { + if (@extensions_group3 eq '0') + { + $count_three 
= '4'; + } + foreach my $three (@extensions_group3) + { + if (length($three) < $overhangb && length($three) > 0) + { + $count_three++; + if (@extensions_group3 < 3) + { + $count_three += 3-@extensions_group3; + } + } + } + if ($count_split > 3) + { + if (@extensions_group4 eq '0') + { + $count_four = '4'; + } + foreach my $four (@extensions_group4) + { + if (length($four) < $overhangb && length($four) > 0) + { + $count_four++; + if (@extensions_group4 < 3) + { + $count_four += 3-@extensions_group4; + } + } + } + } + else + { + $count_four = '4'; + } + } + else + { + $count_three = '4'; + $count_four = '4'; + } + } + if ($type eq "mito_plant") + { + $overhangb += 2; + if ($ext_total > $average_coverage_ext*2) + { + $overhangb += 4; + } + } + my $overhang = sprintf("%.0f", $overhangb); + delete $jump_rep{$id}; + my $overhang_for_pairs = $overhang; + if ($overhang_for_pairs > 35 && $average_coverage_ext > 20) + { + $overhang = 35; + } + if ($overhang_for_pairs > 20 && $average_coverage_ext > 65) + { + $overhang = 20; + } + $overhang_check = $overhang; +BEFORE: + my $s = '0'; + + if ($before_shorter eq "yes") + { + $overhang += 5; + } + $before_shorter = ""; + + print OUTPUT5 $overhang." 
OVERHANG\n"; + my @extensions_yuyu; + undef @extensions_yuyu; + my %extensions_yuyu; + undef %extensions_yuyu; + my %before1F; + my %before2F; + my %before3F; + my %before4F; + my %before1B; + my %before2B; + my %before3B; + my %before4B; + undef %repetitive_pair; + my $skip_overhang = ""; + if ($AT_rich_before eq "yes") + { + my $before_split_tmp = substr $read_short_end2, -($read_length-$overhang); + my $A_rich_test = $before_split_tmp =~ tr/AX\.//; + my $T_rich_test = $before_split_tmp =~ tr/TX\.//; + my $G_rich_test = $before_split_tmp =~ tr/GX\.//; + my $C_rich_test = $before_split_tmp =~ tr/CX\.//; + my $AT_rich_test = $before_split_tmp =~ s/AT//g; + if ($A_rich_test > ($read_length-$overhang-3) || $T_rich_test > ($read_length-$overhang-3) || $G_rich_test > ($read_length-$overhang-3) || $C_rich_test > ($read_length-$overhang-3) || $AT_rich_test > ($read_length-$overhang-6)/2) + { + $skip_overhang = "yes"; + $overhang -= 5; + if ($y > $startprint2) + { + print OUTPUT5 $skip_overhang." 
SKIP OVERHANG\n"; + } + } + } + my $first_nuc1 = substr $best_extension1, 0, 1; + my $first_nuc2 = substr $best_extension2, 0, 1; + my $first_nuc3 = substr $best_extension3, 0, 1; + my $first_nuc4 = substr $best_extension4, 0, 1; + my $count_test = '0'; + my $count_test1 = '0'; + my $count_test2 = '0'; + my $count_test3 = '0'; + my $count_test4 = '0'; + + foreach my $id_tmp (keys %extensions_for_before) + { + $count_test++; + if (length($extensions_for_before{$id_tmp}) <= $overhang) + { + my $first_nuc = substr $extensions_for_before{$id_tmp}, 0, 1; + if ($first_nuc eq $first_nuc1 && $first_nuc ne "") + { + if (exists($extensions_for_before2{$id_tmp})) + { + my $extensions_for_before2 = $extensions_for_before2{$id_tmp}; + $extensions_for_before2 =~ tr/ACTG/TGAC/; + substr $extensions_for_before2, 0, length($extensions_for_before{$id_tmp}),""; + $count_test1++; + $filter_before1_pair{$id_tmp} = $extensions_for_before2; + $before1B{$id_tmp} = $extensions_for_before2; + $extensions_yuyu{$id_tmp} = $extensions_for_before{$id_tmp}; + } + } + elsif ($first_nuc eq $first_nuc2 && $first_nuc ne "") + { + if (exists($extensions_for_before2{$id_tmp})) + { + my $extensions_for_before2 = $extensions_for_before2{$id_tmp}; + $extensions_for_before2 =~ tr/ACTG/TGAC/; + substr $extensions_for_before2, 0, length($extensions_for_before{$id_tmp}),""; + $count_test2++; + $filter_before2_pair{$id_tmp} = $extensions_for_before2; + $before2B{$id_tmp} = $extensions_for_before2; + $extensions_yuyu{$id_tmp} = $extensions_for_before{$id_tmp} + } + } + elsif ($first_nuc eq $first_nuc3 && $count_split > 2 && $first_nuc ne "") + { + if (exists($extensions_for_before2{$id_tmp})) + { + my $extensions_for_before2 = $extensions_for_before2{$id_tmp}; + $extensions_for_before2 =~ tr/ACTG/TGAC/; + substr $extensions_for_before2, 0, length($extensions_for_before{$id_tmp}),""; + $count_test3++; + $filter_before3_pair{$id_tmp} = $extensions_for_before2; + $before3B{$id_tmp} = $extensions_for_before2; + 
$extensions_yuyu{$id_tmp} = $extensions_for_before{$id_tmp} + } + } + elsif ($first_nuc eq $first_nuc4 && $count_split > 3 && $first_nuc ne "") + { + if (exists($extensions_for_before2{$id_tmp})) + { + my $extensions_for_before2 = $extensions_for_before2{$id_tmp}; + $extensions_for_before2 =~ tr/ACTG/TGAC/; + substr $extensions_for_before2, 0, length($extensions_for_before{$id_tmp}),""; + $count_test4++; + $filter_before4_pair{$id_tmp} = $extensions_for_before2; + $before4B{$id_tmp} = $extensions_for_before2; + $extensions_yuyu{$id_tmp} = $extensions_for_before{$id_tmp} + } + } + } + } + if ($y > $startprint2) + { + print OUTPUT5 $count_test1." COUNT_TEST1\n"; + print OUTPUT5 $count_test2." COUNT_TEST2\n"; + print OUTPUT5 $count_test3." COUNT_TEST3\n"; + print OUTPUT5 $count_test4." COUNT_TEST4\n"; + print OUTPUT5 $count_test." COUNT_ALL\n"; + } + $s = '0'; + my $before_split = substr $read_short_end2, -($read_length-$left-1), $overhang+$overlap; + my $star3 = '0'; + if ($containX_short_end2 > 0) + { + my $before_split2 = substr $read_short_end2, -($read_length-$left-1); + $star3 = $before_split2 =~ tr/\*//; + if ($star3 > 0) + { + $before_split = substr $read_short_end2, -($read_length-$left-1+($star3*2)), $overhang+$overlap+($star3*2); + } + } + while ($s <= length($before_split)-$overlap) + { + my $line_tmp = substr $before_split, $s, $overlap; + if ($star3 > 0) + { + my $star = $line_tmp =~ tr/\*//; + $line_tmp = substr $before_split, $s, $overlap+($star*2); + my $star2 = $line_tmp =~ tr/\*//; + while ($star2 > $star) + { + $line_tmp = substr $before_split, $s, $overlap+($star*2)+(($star2-$star)*2); + $star = $star2; + $star2 = $line_tmp =~ tr/\*//; + } + } + my %line_tmp = build_partial3b $line_tmp, ""; + foreach my $line (keys %line_tmp) + { + if (exists($hash2b{$line})) + { + my $search0 = $hash2b{$line}; + my $search_rev; + $search0 = substr $search0, 1; + my @search = split /,/,$search0; + + foreach my $search (@search) + { + my $search_tmp = substr 
$search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + my $found_rev; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + $found_rev = $search_tmp[1]; + $search_rev = $search_tmp."2"; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + $found_rev = $search_tmp[0]; + $search_rev = $search_tmp."1"; + } + if ($use_quality ne "") + { + $found =~ tr/1234/ACTG/; + $found_rev =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + $found_rev = decrypt $found_rev; + } + my $found_new; + my $first_nuc; + my $last_10 = substr $found, -(11+$s), 10; + my $last_10b = substr $found, -(11+$s-($star3)), 10; + + my $last_10_read_end = substr $read_end, -9; + my $check_last10 = $last_10 =~ s/(.)$last_10_read_end/$1$last_10_read_end/; + my $check_last10b = $last_10b =~ s/(.)$last_10_read_end/$1$last_10_read_end/; + if ($check_last10 > 0) + { + $found_new = substr $found, 0, -(1+$s), ""; + $first_nuc = substr $found, 0, 1; + } + elsif ($check_last10b > 0) + { + $found_new = substr $found, 0, -(1+$s-($star3)),""; + $first_nuc = substr $found, 0, 1; + } + if ($check_last10 > 0 || $check_last10b > 0) + { + my $first_nuc1 = substr $best_extension1, 0, 1; + my $first_nuc2 = substr $best_extension2, 0, 1; + my $first_nuc3 = substr $best_extension3, 0, 1; + my $first_nuc4 = substr $best_extension4, 0, 1; + my $extension_yuyu = $found; + $extensions_yuyu{$search} = $extension_yuyu; + if ($first_nuc eq $first_nuc1 && ($check_last10 > 0 || $check_last10b > 0) && $first_nuc ne "") + { + if ($y > $startprint2) + { + } + my $found_tmp = reverse($found_new); + $before1F{$search} = $found_tmp; + $found_rev =~ tr/ACTG/TGAC/; + my $found_rev2 = reverse($found_rev); + $repetitive_pair{$found_rev2} = $search_rev; + } + elsif ($first_nuc eq $first_nuc2 && ($check_last10 > 0 || $check_last10b > 0) && $first_nuc ne "") + { + if ($y > $startprint2) + { + } + my 
$found_tmp = reverse($found_new); + $before2F{$search} = $found_tmp; + $found_rev =~ tr/ACTG/TGAC/; + my $found_rev2 = reverse($found_rev); + $repetitive_pair{$found_rev2} = $search_rev; # candidate 2: keep the reversed remainder, index the reverse-complemented mate under the mate key
+ } + elsif ($first_nuc eq $first_nuc3 && ($check_last10 > 0 || $check_last10b > 0) && $first_nuc ne "") # read starts with the first base of candidate extension 3
+ { + if ($y > $startprint2) + { + } + my $found_tmp = reverse($found_new); + $before3F{$search} = $found_tmp; + $found_rev =~ tr/ACTG/TGAC/; + my $found_rev2 = reverse($found_rev); + $repetitive_pair{$found_rev2} = $search_rev; + }
+ elsif ($first_nuc eq $first_nuc4 && ($check_last10 > 0 || $check_last10b > 0) && $first_nuc ne "") # read starts with the first base of candidate extension 4
+ { + if ($y > $startprint2) + { + } + my $found_tmp = reverse($found_new); + $before4F{$search} = $found_tmp; + $found_rev =~ tr/ACTG/TGAC/; + my $found_rev2 = reverse($found_rev); + $repetitive_pair{$found_rev2} = $search_rev; + }
+ if ($save_reads ne "") # optional export of every read pair that was used
+ { + my $add_read = $search_tmp; + if (exists($save_reads{$add_read})) + { + } + else + { + $save_reads{$add_read} = undef; # remember the pair so it is written only once
+ if ($save_reads eq "2") + { + my $add_read2 = $map_ids{$add_read}; + print OUTPUT10 $add_read2."\/1\n"; + print OUTPUT11 $add_read2."\/2\n"; + } + else + { + print OUTPUT10 ">".$add_read."\/1\n"; + print OUTPUT11 ">".$add_read."\/2\n"; + }
+ if (exists($hash{$add_read})) + { + my @add_read = split /,/,$hash{$add_read}; + my $forward = $add_read[0]; + my $reverse = $add_read[1]; + if ($use_quality ne "") # stored as digits 1234 in this mode, translate back to ACTG
+ { + $forward =~tr/1234/ACTG/; + $reverse =~tr/1234/ACTG/; + } + print OUTPUT10 $forward."\n"; + print OUTPUT11 $reverse."\n"; + } + } + } + + } + } + } + } + } + $s++; + }
+ my %before_all1 = (%before1B, %before1F); # merge per candidate; on a key collision the F entry overwrites the B entry
+ my %before_all2 = (%before2B, %before2F); + my %before_all3 = (%before3B, %before3F); + my %before_all4 = (%before4B, %before4F); + + + my $end_short_tmp = substr $read_short_end2, -($read_length+20); # last read_length+20 characters of $read_short_end2
+ +BEFORE_EXTRA: + if ($y > $startprint2) + { + print OUTPUT5 $end_short_tmp." END_SHORT_TMP\n\n"; + print OUTPUT5 reverse($end_short_tmp)." 
REVERSE_END_SHORT_TMP\n\n"; + }
+ my $first_yuyu = ""; # per-candidate flag, set to yes once a stored read matches the assembled end
+ my $second_yuyu = ""; + my $third_yuyu = ""; + my $fourth_yuyu = "";
+ my $first_yuyu2 = '0'; # per-candidate number of reads that passed the first match
+ my $second_yuyu2 = '0'; + my $third_yuyu2 = '0'; + my $fourth_yuyu2 = '0';
+ my @filter_dot_before1; # reads (as stored, reversed) that also passed the full-length match
+ my @filter_dot_before2; + my @filter_dot_before3; + my @filter_dot_before4; + undef @filter_dot_before1; + undef @filter_dot_before2; + undef @filter_dot_before3; + undef @filter_dot_before4;
+ my @extensions_before; # extensions of all accepted reads, plus one list per candidate
+ my @extensions_before1; + my @extensions_before2; + my @extensions_before3; + my @extensions_before4; + undef @extensions_before; + undef @extensions_before1; + undef @extensions_before2; + undef @extensions_before3; + undef @extensions_before4; +
+ my $end_short_tmp_part = substr $end_short_tmp, -($read_length-$overhang-5-1); # shorter tail used for the first match
+ my $star_first = substr $end_short_tmp_part, 0, 1; + if ($star_first eq "*") # take one more character when the tail would start on a star marker
+ { + $end_short_tmp_part = substr $end_short_tmp, -($read_length-$overhang-5-1+1); + } +
+ foreach my $search (keys %before_all2) # candidate 2: test every stored read against the assembled end
+ { + my $overhang_tmp = $overhang; + my $yuyu0 = $before_all2{$search}; + my $yuyu2 = reverse($yuyu0); # values are stored reversed, restore the orientation
+ if (length($yuyu2) >= $read_length-$overhang_tmp-5-1) + { + $yuyu2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; # ambiguity codes become dots; the pipe characters are literal members of the tr list
+ my $yuyu2_tmp = $yuyu2; + my $check_yuyuy2 = $yuyu2_tmp =~ s/.$end_short_tmp_part/$end_short_tmp_part/; # substitution on a copy, used only as a match test; the tail is interpolated as a regex
+ + if ($check_yuyuy2 > 0) + { + $second_yuyu = "yes"; + $second_yuyu2++; + my $end_short_tmp_part = substr $end_short_tmp, -length($yuyu2)+1; # inner variable shadows the outer tail: last length-1 characters
+ my $end_short_tmp_part2; + my $star = $end_short_tmp_part =~ tr/\*//; # counts star markers without changing the string
+ my $check_yuyuy2b2 = '0'; + if ($star > 0) # widen the tail by the number of stars (twice, and once as an alternative)
+ { + $end_short_tmp_part = substr $end_short_tmp, -length($yuyu2)+1-($star*2); + $end_short_tmp_part2 = substr $end_short_tmp, -length($yuyu2)+1-($star); + my $yuyu2_tmp2 = $yuyu2; + $check_yuyuy2b2 = $yuyu2_tmp2 =~ s/.$end_short_tmp_part2/$end_short_tmp_part2/; + } + my $yuyu2_tmp3 = $yuyu2; + my $check_yuyuy2b = $yuyu2_tmp3 =~ s/.$end_short_tmp_part/$end_short_tmp_part/; + if ($check_yuyuy2b > 0 ||
$check_yuyuy2b2 > 0) + { + push @filter_dot_before2, $yuyu0; # full-length match for candidate 2
+ if (exists($extensions_yuyu{$search})) + { + if ($y > $startprint2) + { + print OUTPUT5 $yuyu0." FOUND2 "; + print OUTPUT5 $extensions_yuyu{$search}." EXTe2\n"; + } + push @extensions_before, $extensions_yuyu{$search}; + push @extensions_before2, $extensions_yuyu{$search}; + } + $filter_before2{$search} = undef; # register the read key; a value is filled in later
+ } + } + } + }
+ foreach my $search (keys %before_all1) # candidate 1: same two-stage match as for candidate 2
+ { + my $overhang_tmp = $overhang; + my $yuyu0 = $before_all1{$search}; + my $yuyu1 = reverse($yuyu0); + if (length($yuyu1) >= $read_length-$overhang_tmp-5-1) + { + $yuyu1 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $yuyu1_tmp = $yuyu1; + my $check_yuyuy = $yuyu1_tmp =~ s/.$end_short_tmp_part/$end_short_tmp_part/; + + if ($check_yuyuy > 0) + { + $first_yuyu2++; + $first_yuyu = "yes"; + my $end_short_tmp_part = substr $end_short_tmp, -length($yuyu1)+1; + my $star = $end_short_tmp_part =~ tr/\*//; + my $check_yuyuy1b2 = '0'; + my $end_short_tmp_part2; + if ($star > 0) + { + $end_short_tmp_part = substr $end_short_tmp, -length($yuyu1)+1-($star*2); + $end_short_tmp_part2 = substr $end_short_tmp, -length($yuyu1)+1-($star); + my $yuyu1_tmp2 = $yuyu1; + $check_yuyuy1b2 = $yuyu1_tmp2 =~ s/.$end_short_tmp_part2/$end_short_tmp_part2/; + } + my $yuyu1_tmp3 = $yuyu1; + my $check_yuyuy1b = $yuyu1_tmp3 =~ s/.$end_short_tmp_part/$end_short_tmp_part/; + if ($check_yuyuy1b > 0 || $check_yuyuy1b2 > 0) + { + push @filter_dot_before1, $yuyu0; + if (exists($extensions_yuyu{$search})) + { + if ($y > $startprint2) + { + print OUTPUT5 $yuyu0." FOUND1 "; + print OUTPUT5 $extensions_yuyu{$search}." 
EXTe1\n"; + } + push @extensions_before, $extensions_yuyu{$search}; + push @extensions_before1, $extensions_yuyu{$search}; + } + $filter_before1{$search} = undef; + } + } + } + }
+ foreach my $search (keys %before_all3) # candidate 3: same two-stage match
+ { + my $overhang_tmp = $overhang; + my $yuyu0 = $before_all3{$search}; + my $yuyu3 = reverse($yuyu0); + if (length($yuyu3) >= $read_length-$overhang_tmp-5-1) + { + $yuyu3 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $yuyu3_tmp = $yuyu3; + my $check_yuyuy = $yuyu3_tmp =~ s/.$end_short_tmp_part/$end_short_tmp_part/; + + if ($check_yuyuy > 0) + { + $third_yuyu2++; + $third_yuyu = "yes"; + my $end_short_tmp_part = substr $end_short_tmp, -length($yuyu3)+1; + my $star = $end_short_tmp_part =~ tr/\*//; + my $check_yuyuy3b2 = '0'; + my $end_short_tmp_part2; + if ($star > 0) + { + $end_short_tmp_part = substr $end_short_tmp, -length($yuyu3)+1-($star*2); + $end_short_tmp_part2 = substr $end_short_tmp, -length($yuyu3)+1-($star); + my $yuyu3_tmp2 = $yuyu3; + $check_yuyuy3b2 = $yuyu3_tmp2 =~ s/.$end_short_tmp_part2/$end_short_tmp_part2/; + } + my $yuyu3_tmp3 = $yuyu3; + my $check_yuyuy3b = $yuyu3_tmp3 =~ s/.$end_short_tmp_part/$end_short_tmp_part/; + if ($check_yuyuy3b > 0 || $check_yuyuy3b2 > 0) + { + push @filter_dot_before3, $yuyu0; + if (exists($extensions_yuyu{$search})) + { + if ($y > $startprint2) + { + print OUTPUT5 $yuyu0." FOUND3 "; + print OUTPUT5 $extensions_yuyu{$search}." 
EXTe3\n"; + } + push @extensions_before, $extensions_yuyu{$search}; + push @extensions_before3, $extensions_yuyu{$search}; + } + $filter_before3{$search} = undef; + } + } + } + }
+ foreach my $search (keys %before_all4) # candidate 4: same two-stage match
+ { + my $overhang_tmp = $overhang; + my $yuyu0 = $before_all4{$search}; + my $yuyu4 = reverse($yuyu0); + if (length($yuyu4) >= $read_length-$overhang_tmp-5-1) + { + $yuyu4 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $yuyu4_tmp = $yuyu4; + my $check_yuyuy = $yuyu4_tmp =~ s/.$end_short_tmp_part/$end_short_tmp_part/; + + if ($check_yuyuy > 0) + { + $fourth_yuyu2++; + $fourth_yuyu = "yes"; + my $end_short_tmp_part = substr $end_short_tmp, -length($yuyu4)+1; + my $star = $end_short_tmp_part =~ tr/\*//; + my $check_yuyuy4b2 = '0'; + my $end_short_tmp_part2; + if ($star > 0) + { + $end_short_tmp_part = substr $end_short_tmp, -length($yuyu4)+1-($star*2); + $end_short_tmp_part2 = substr $end_short_tmp, -length($yuyu4)+1-($star); + my $yuyu4_tmp2 = $yuyu4; + $check_yuyuy4b2 = $yuyu4_tmp2 =~ s/.$end_short_tmp_part2/$end_short_tmp_part2/; + } + my $yuyu4_tmp3 = $yuyu4; + my $check_yuyuy4b = $yuyu4_tmp3 =~ s/.$end_short_tmp_part/$end_short_tmp_part/; + if ($check_yuyuy4b > 0 || $check_yuyuy4b2 > 0) + { + push @filter_dot_before4, $yuyu0; + if (exists($extensions_yuyu{$search})) + { + if ($y > $startprint2) + { + print OUTPUT5 $yuyu0." FOUND4 "; + print OUTPUT5 $extensions_yuyu{$search}." EXTe4\n"; + } + push @extensions_before, $extensions_yuyu{$search}; + push @extensions_before4, $extensions_yuyu{$search}; + } + $filter_before4{$search} = undef; + } + } + } + } + + if ($y > $startprint2) + { + print OUTPUT5 $first_yuyu2." FIRST_YUYU\n"; + print OUTPUT5 $second_yuyu2." SECOND_YUYU\n"; + print OUTPUT5 $third_yuyu2." THIRD_YUYU\n"; + print OUTPUT5 $fourth_yuyu2." 
FOURTH_YUYU\n"; + }
+ my $correction = '0'; # number of ambiguous positions rewritten in $read below
+ my $count_all = '0'; + $count_all = (@filter_dot_before1) + (@filter_dot_before2) + (@filter_dot_before3) + (@filter_dot_before4); # arrays in numeric context: total number of accepted reads
+ my $end_short_tmp2 = substr $read_short_end2, -($read_length-$overhang-5-1); + my $check_dot = $end_short_tmp2 =~ tr/\./\./; # number of dots in the tail
+ if ($check_dot eq "") # NOTE(review): tr returns a count, so this guard looks like it can never fire - confirm
+ { + $check_dot = '0' + } + my $check_star = $end_short_tmp2 =~ tr/\*/\*/; # number of star markers in the tail
+ my $check_star2 = ""; + my $deletion = ""; +print OUTPUT5 $check_star." CHECK_STAR\n"; # NOTE(review): these two prints are not guarded by the startprint2 check used elsewhere
+print OUTPUT5 $count_all." COUNT_ALL\n";
+ if ($count_all > 2 && ($check_dot > 0 || $check_star > 0) && $type ne "mito_plant" && $heteroplasmy eq "") # try to resolve dots in the tail by majority vote over the accepted reads
+ { + my @split_dot = split /\./, reverse($end_short_tmp2); # segments between dots, walking from the end of the tail
+ my $size = '0'; + my $size2 = '1'; + my $length_total = '0'; + my @filter_dot_before_all; + undef @filter_dot_before_all; + @filter_dot_before_all = (@filter_dot_before1, @filter_dot_before2, @filter_dot_before3, @filter_dot_before4); +
+ if ($check_star > 0) # collect, in reversed order, the character that follows each star
+ { + my @deletion = split //, reverse($end_short_tmp2); + my $found_star = ""; + foreach my $nuc (@deletion) + { + if ($found_star eq "yes") + { + $found_star = ""; + $deletion .= $nuc; + } + if ($nuc eq "*") + { + $found_star = "yes"; + } + } + print OUTPUT5 $deletion." NUC0\n"; + }
+ foreach my $dot_split (@split_dot) # counts the segments
+ { + $size++; + }
+ foreach my $dot_split (@split_dot) + { + if ($size2 ne $size) # every segment except the last one is followed by a dot
+ { + $length_total += length($dot_split); # offset of the current dot, counted from the end
+ my $A = '0'; + my $C = '0'; + my $T = '0'; + my $G = '0'; + my $check_star2_part = ""; + if ($check_star > 0) + { + $check_star2_part = substr reverse($end_short_tmp2), 0, $length_total; + $check_star2 = $check_star2_part =~ tr/\*/\*/; # stars located before the current dot
+ print OUTPUT5 $check_star2." CHECK_STAR2\n"; + }
+ foreach my $dot_before2 (@filter_dot_before_all) # tally the base each accepted read has at the dot position
+ { + my $dot = substr $dot_before2, $length_total, 1; + if ($check_star2 > 0) + { + print OUTPUT5 $deletion." 
DEL_DOT\n"; + my $check_star2_part_tmp1 = $check_star2_part; + $check_star2_part_tmp1 =~ tr/\*//d; # drop the stars before comparing with the read
+ my $dot_before2_tmp = $dot_before2; + my $check_star1 = $dot_before2_tmp =~ s/.$check_star2_part_tmp1/$check_star2_part_tmp1/; + if ($check_star1 > 0) # shift the position by one or two characters per star depending on the match
+ { + $dot = substr $dot_before2, $length_total-$check_star2, 1; + } + else + { + $dot = substr $dot_before2, $length_total-($check_star2*2), 1; + } + } + if ($dot eq "A") + { + $A++; + } + elsif ($dot eq "C") + { + $C++; + } + elsif ($dot eq "T") + { + $T++; + } + elsif ($dot eq "G") + { + $G++; + } + }
+ if (($A > 2 && ($C+$T+$G) eq 0) || ($A > ($C+$T+$G)*10 && $repetitive_detect eq "")) # unanimous (more than 2 reads) or more than tenfold majority outside repeat mode
+ { + substr $read, -$length_total-1, 1, "A"; # four-argument substr overwrites that position of $read in place
+ print OUTPUT5 "DOT AAAAAAAAAAAA\n"; + $correction++; + }
+ if (($C > 2 && ($A+$T+$G) eq 0) || ($C > ($A+$T+$G)*10 && $repetitive_detect eq "")) + { + substr $read, -$length_total-1, 1, "C"; + print OUTPUT5 "DOT CCCCCCCCCCCC\n"; + $correction++; + }
+ if (($T > 2 && ($C+$A+$G) eq 0) || ($T > ($C+$A+$G)*10 && $repetitive_detect eq "")) + { + substr $read, -$length_total-1, 1, "T"; + print OUTPUT5 "DOT TTTTTTTTTTTT\n"; + $correction++; + }
+ if (($G > 2 && ($C+$T+$A) eq 0) || ($G > ($C+$T+$A)*10 && $repetitive_detect eq "")) + { + substr $read, -$length_total-1, 1, "G"; + print OUTPUT5 "DOT GGGGGGGGGGGGG\n"; + $correction++; + }
+ $length_total += 1; # step over the dot itself
+ } + $size2++; + } + }
+ if ($jump_rep_because_stuck eq "yes") + { + goto JUMP_REP; + } +
+ if ($first_yuyu eq "yes" && $second_yuyu ne "yes" && $third_yuyu ne "yes" && $fourth_yuyu ne "yes" && $first_yuyu2 > 3 && ($correction eq $check_dot || $overhang >= $read_length-($overlap+15))) # only candidate 1 is supported, by more than 3 reads
+ { + if (@extensions_before > 3000000) # very large extension set: hand the candidate 1 extensions to EXT_BEFORE instead
+ { + $ext_before = "yes"; + @extensions_before = @extensions_before1; + goto EXT_BEFORE; + }
+ delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + $best_extension = $best_extension1; + if ($y > $startprint2) + { + print OUTPUT5 "\nONLY FIRST\n\n"; + } + delete $SNP_active{$id}; + delete
$before{$id}; + delete $before_shorter_skip{$id}; + goto AFTER_EXT; + }
+ elsif ($second_yuyu eq "yes" && $first_yuyu ne "yes" && $third_yuyu ne "yes" && $fourth_yuyu ne "yes" && $second_yuyu2 > 3 && ($correction eq $check_dot || $overhang >= $read_length-($overlap+15))) # only candidate 2 is supported, by more than 3 reads
+ { + if (@extensions_before > 3000000) + { + $ext_before = "yes"; + @extensions_before = @extensions_before2; + goto EXT_BEFORE; + }
+ delete $seed{$id}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + if ($y > $startprint2) + { + print OUTPUT5 "\nONLY SECOND\n\n"; + } + delete $SNP_active{$id_split1}; + delete $before{$id_split1}; + delete $before_shorter_skip{$id_split1}; + $id = $id_split1; # continue under the id of the first split seed
+ if ($correction > 0) # positions in $read were rewritten: drop the split seed and continue at AFTER_EXT
+ { + delete $seed{$id_split1}; + $best_extension = $best_extension2; + goto AFTER_EXT; + }
+ else # nothing rewritten: reload the split seed and continue at BACK
+ { + $read_new = $seed{$id_split1}; + $read_new1 = $read_new; + $best_extension = $best_extension2; + $ref_skip_before = "yes"; + goto BACK; + } + }
+ elsif ($third_yuyu eq "yes" && $first_yuyu ne "yes" && $second_yuyu ne "yes" && $fourth_yuyu ne "yes" && $third_yuyu2 > 3 && ($correction eq $check_dot || $overhang >= $read_length-($overlap+15))) # only candidate 3 is supported
+ { + if (@extensions_before > 300000) # NOTE(review): limit is 300000 here but 3000000 in the sibling branches - confirm which is intended
+ { + $ext_before = "yes"; + @extensions_before = @extensions_before3; + goto EXT_BEFORE; + }
+ delete $seed{$id}; + delete $seed{$id_split3}; + delete $seed{$id_split1}; + if ($y > $startprint2) + { + print OUTPUT5 "\nONLY THIRD\n\n"; + } + delete $SNP_active{$id_split2}; + delete $before{$id_split2}; + delete $before_shorter_skip{$id_split2}; + $id = $id_split2; + if ($correction > 0) + { + delete $seed{$id_split2}; + $best_extension = $best_extension3; + goto AFTER_EXT; + } + else + { + $read_new = $seed{$id_split2}; + $read_new1 = $read_new; + $best_extension = $best_extension3; + $ref_skip_before = "yes"; + goto BACK; + } + }
+ elsif ($fourth_yuyu eq "yes" && $first_yuyu ne "yes" && $third_yuyu ne "yes" && $second_yuyu ne "yes" && $fourth_yuyu2 > 3 && ($correction eq $check_dot || $overhang >=
$read_length-($overlap+15))) # only candidate 4 is supported, by more than 3 reads
+ { + if (@extensions_before > 3000000) + { + $ext_before = "yes"; + @extensions_before = @extensions_before4; + goto EXT_BEFORE; + }
+ delete $seed{$id}; + delete $seed{$id_split2}; + delete $seed{$id_split1}; + if ($y > $startprint2) + { + print OUTPUT5 "\nONLY FOURTH\n\n"; + } + delete $SNP_active{$id_split3}; + delete $before{$id_split3}; + delete $before_shorter_skip{$id_split3}; + $id = $id_split3; + if ($correction > 0) + { + $best_extension = $best_extension4; + delete $seed{$id_split3}; + goto AFTER_EXT; + } + else + { + $read_new = $seed{$id_split3}; + $read_new1 = $read_new; + $best_extension = $best_extension4; + $ref_skip_before = "yes"; + goto BACK; + } + }
+ elsif($count_all < 3 && $overhang < $read_length-($overlap+15) && $skip_overhang ne "yes") # fewer than 3 accepted reads: flag before_shorter and rerun from BEFORE
+ { + if ($y > $startprint2) + { + print OUTPUT5 "\nMAKE BEFORE SHORTER\n\n"; + } + $before_shorter = "yes"; + goto BEFORE; + }
+ else # several candidates are supported: bring in the paired reads
+ { + my %count1234; + undef %count1234; + my %count1234b; + undef %count1234b; + + my $count1 = '0'; + my $count2 = '0'; + my $count3 = '0'; + my $count4 = '0'; + my $count1_pair = '0'; + my $count2_pair = '0'; + my $count3_pair = '0'; + my $count4_pair = '0';
+ foreach my $exb (keys %filter_before1) # candidate 1: look up the mate of every accepted read
+ { + my $search_tmp = substr $exb, 0, -1; # key without its last character is the pair key in %hash
+ my $search_end = substr $exb, -1; # last character (1 or 2) says which read of the pair matched
+ if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + my $found2; + if ($search_end eq "1") # $found is the other read of the pair
+ { + $found = $search_tmp[1]; + $found2 = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[0]; + $found2 = $search_tmp[1]; + } + if ($use_quality ne "") + { + $found =~ tr/1234/ACTG/; + $found2 =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + $found2 = decrypt $found2; + } + $count1++; + if (exists($filter_before1{$exb})) + { + if (exists($filter_before1_pair{$exb})) # keep only reads that also have an entry in the pair hash; store the mate as value
+ { + $filter_before1{$exb} = $found; + $count1_pair++; + if ($y > $startprint2) + { + print OUTPUT5 $found." 
1\n"; + } + } + else + { + delete $filter_before1{$exb}; + } + } + } + }
+ foreach my $exb (keys %filter_before2) # candidate 2: same mate lookup
+ { + my $search_tmp = substr $exb, 0, -1; + my $search_end = substr $exb, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + my $found2; + if ($search_end eq "1") + { + $found = $search_tmp[1]; + $found2 = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[0]; + $found2 = $search_tmp[1]; + } + if ($use_quality ne "") + { + $found =~ tr/1234/ACTG/; + $found2 =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + $found2 = decrypt $found2; + } + $count2++; + if (exists($filter_before2{$exb})) + { + if (exists($filter_before2_pair{$exb})) + { + $filter_before2{$exb} = $found; + $count2_pair++; + if ($y > $startprint2) + { + print OUTPUT5 $found." 2\n"; + } + } + else + { + delete $filter_before2{$exb}; + } + } + } + }
+ foreach my $exb (keys %filter_before3) # candidate 3: same mate lookup, without the unused second read
+ { + my $search_tmp = substr $exb, 0, -1; + my $search_end = substr $exb, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[1]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[0]; + } + if ($use_quality ne "") + { + $found =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $count3++; + if (exists($filter_before3{$exb})) + { + if (exists($filter_before3_pair{$exb})) + { + $filter_before3{$exb} = $found; + $count3_pair++; + if ($y > $startprint2) + { + print OUTPUT5 $found." 
3\n"; + } + } + else + { + delete $filter_before3{$exb}; + } + } + } + }
+ foreach my $exb (keys %filter_before4) # candidate 4: same mate lookup
+ { + my $search_tmp = substr $exb, 0, -1; + my $search_end = substr $exb, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[1]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[0]; + } + if ($use_quality ne "") + { + $found =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $count4++; + if (exists($filter_before4{$exb})) + { + if (exists($filter_before4_pair{$exb})) + { + $filter_before4{$exb} = $found; + $count4_pair++; + if ($y > $startprint2) + { + print OUTPUT5 $found." 4\n"; + } + } + else + { + delete $filter_before4{$exb}; + } + } + } + } + if ($y > $startprint2) + { + print OUTPUT5 $count1." COUNT1\n"; + print OUTPUT5 $count2." COUNT2\n"; + print OUTPUT5 $count3." COUNT3\n"; + print OUTPUT5 $count4." COUNT4\n"; + }
+ $count1234{'1'} = $count1_pair; # per-candidate number of reads with a pair entry
+ $count1234{'2'} = $count2_pair; + $count1234{'3'} = $count3_pair; + $count1234{'4'} = $count4_pair; + + my $morethan3 = '0'; + my $difference = ""; +
+ foreach my $count1234 (keys %count1234) # how many candidates have more than 3 paired reads
+ { + if ($count1234{$count1234} > 3) + { + $morethan3++; + } + }
+ my $best_extension1_tmp = $best_extension1; + my $best_extension2_tmp = $best_extension2; + my $SNR_check1 = $best_extension1_tmp =~ s/AAAAAAAA|CCCCCCCC|GGGGGGGG|TTTTTTTT//; # true when the extension holds a run of 8 identical bases; done on a copy
+ my $SNR_check2 = $best_extension2_tmp =~ s/AAAAAAAA|CCCCCCCC|GGGGGGGG|TTTTTTTT//; + if ($repetitive_detect ne "") + { + $SNR_check1 = ""; + $SNR_check2 = ""; + }
+ my $GGGG2 = $best_extension2 =~ tr/G/G/; # per-base counts in candidate extension 2
+ my $TTTT2 = $best_extension2 =~ tr/T/T/; + my $CCCC2 = $best_extension2 =~ tr/C/C/; + my $AAAA2 = $best_extension2 =~ tr/A/A/; + if ((($GGGG2 eq length($best_extension2) || $TTTT2 eq length($best_extension2) || $CCCC2 eq length($best_extension2) || $AAAA2 eq length($best_extension2)) && length($best_extension2) > 2) ||
length($best_extension2)*0.58 < $AAAA2 || length($best_extension2)*0.58 < $CCCC2 || length($best_extension2)*0.58 < $TTTT2 || length($best_extension2)*0.58 < $GGGG2) # single-base extension longer than 2, or one base making up more than 58 percent
+ { + $SNR_check2 = '1'; + }
+ my $GGGG1 = $best_extension1 =~ tr/G/G/; # same composition test for candidate extension 1
+ my $TTTT1 = $best_extension1 =~ tr/T/T/; + my $CCCC1 = $best_extension1 =~ tr/C/C/; + my $AAAA1 = $best_extension1 =~ tr/A/A/; + if ((($GGGG1 eq length($best_extension1) || $TTTT1 eq length($best_extension1) || $CCCC1 eq length($best_extension1) || $AAAA1 eq length($best_extension1)) && length($best_extension1) > 2) || length($best_extension1)*0.58 < $AAAA1 || length($best_extension1)*0.58 < $CCCC1 || length($best_extension1)*0.58 < $TTTT1 || length($best_extension1)*0.58 < $GGGG1) + { + $SNR_check1 = '1'; + }
+ my $h = '8'; # required ratio between the winning count and the sum of the others
+ if ($type eq "mito_plant" && ($count1+$count2+$count3+$count4) > $average_coverage_ext*0.8) + { + $h = '20'; + } + if ($type eq "mito_plant" && ($count1+$count2+$count3+$count4) > $average_coverage_ext*4) + { + $h = ($count1+$count2+$count3+$count4)/($average_coverage_ext/6); + }
+ if ((($count1 > 3 && ($count2+$count3+$count4) eq 0) || ($count1 > ($count2+$count3+$count4)*$h && ($count2+$count3+$count4) ne 0)) && $repetitive_detect2 ne "yes" && $correction eq $check_dot) # candidate 1 clearly dominates the mate lookups
+ { + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + $best_extension = $best_extension1; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE9\n\n"; + } + delete $SNP_active{$id}; + delete $before{$id}; + delete $before_shorter_skip{$id}; + goto AFTER_EXT; + }
+ elsif ((($count2 > 3 && ($count1+$count3+$count4) eq 0) || ($count2 > ($count1+$count3+$count4)*$h && ($count1+$count3+$count4) ne 0)) && $repetitive_detect2 ne "yes" && $correction eq $check_dot) # candidate 2 clearly dominates
+ { + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE10\n\n"; + } + delete
$SNP_active{$id_split1}; + delete $before{$id_split1}; + delete $before_shorter_skip{$id_split1}; + $id = $id_split1; + + $best_extension = $best_extension2; + goto AFTER_EXT; + }
+ elsif ((($count3 > 3 && ($count1+$count2+$count4) eq 0) || ($count3 > ($count1+$count2+$count4)*$h && ($count1+$count2+$count4) ne 0)) && $repetitive_detect2 ne "yes" && $correction eq $check_dot) # candidate 3 clearly dominates
+ { + delete $seed{$id}; + delete $seed{$id_split3}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE11\n\n"; + } + delete $SNP_active{$id_split2}; + delete $before{$id_split2}; + delete $before_shorter_skip{$id_split2}; + $id = $id_split2; + $best_extension = $best_extension3; + goto AFTER_EXT; + }
+ elsif ((($count4 > 3 && ($count1+$count3+$count2) eq 0) || ($count4 > ($count1+$count3+$count2)*$h && ($count1+$count3+$count2) ne 0)) && $repetitive_detect2 ne "yes" && $correction eq $check_dot) # candidate 4 clearly dominates; NOTE(review): unlike the two branches above, seed id_split3 is not deleted here - confirm
+ { + delete $seed{$id}; + delete $seed{$id_split2}; + delete $seed{$id_split1}; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE12\n\n"; + } + delete $SNP_active{$id_split3}; + delete $before{$id_split3}; + delete $before_shorter_skip{$id_split3}; + $id = $id_split3; + + $best_extension = $best_extension4; + goto AFTER_EXT; + }
+ elsif((($morethan3 eq 0 && $overhang < $read_length-($overlap*1.2)) || ($morethan3 > 0 && length($read) <= $insert_size && $overhang < 30)) && $skip_overhang ne "yes") # no winner: flag before_shorter and rerun from BEFORE
+ { + if ($y > $startprint2) + { + print OUTPUT5 "\nMAKE BEFORE SHORTER2\n\n"; + } + $before_shorter = "yes"; + goto BEFORE; + }
+ elsif ($overhang_for_pairs > $overhang && $average_coverage_ext < 80 && $skip_overhang eq "") # rerun from BEFORE with the larger overhang_for_pairs value
+ { + if ($y > $startprint2) + { + print OUTPUT5 "\nINCREASE_OVERHANG\n\n"; + } + $overhang = $overhang_for_pairs; + goto BEFORE; + }
+ elsif ($morethan3 > 0 && $last_chance ne "yes" && (($SNR_check1 eq "" && $SNR_check2 eq "") || $overhang < 30)) # match the mates against a window of $read
+ { + my $count1b = '0'; + my
$count2b = '0'; + my $count3b = '0'; + my $count4b = '0'; + + my $size = keys %read_short_end_tmp; + undef @extensions_before; # restart the extension lists; they are refilled from the mate matches below
+ undef @extensions_before1; + undef @extensions_before2; + undef @extensions_before3; + undef @extensions_before4; +
+ my $insert_range_before = 1.65; # factor on insert_size for the window start; 1.45 in the shorter-insert retry
+ if ($insert_range_shorter eq "yes") + { + $insert_range_before = 1.45; + } + my $buffer_front = '11'; + my $buffer_back = $right+2; + if ($insert_range_shorter eq "yes") + { + $buffer_front = '6'; + $buffer_back = '8'; + } +
+ my $F = $insert_size - (($insert_size*($insert_range_before-0.2))-$insert_size) - $read_length - $overhang + $buffer_front; # number of characters cut from the end of the window
+ if ($F <= 0) + { + $F = '1'; + } + + my $read_short_end_tempie = substr $read, -(($insert_size*$insert_range_before)-$overhang), -$F; # window of $read searched for the mates
+ $read_short_end_tempie =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + undef %read_short_end_tmp; + + %read_short_end_tmp = build_partial3c $read_short_end_tempie; # helper defined elsewhere in the file; only the keys of its result are used below
+ if ($y > $startprint2) + { + print OUTPUT5 $read_short_end_tempie." READ_SHORT\n"; + } + + my $ff = '0'; + my %hash_read_short_end; + undef %hash_read_short_end;
+ foreach my $read_short_end_tempie (keys %read_short_end_tmp) # collect every substring of length read_length-left-right-1 of each key
+ { + $ff = '0'; + while ($ff < (length($read_short_end_tempie)-$read_length+$left+$right+2)) + { + my $read_short_end_part = substr $read_short_end_tempie, $ff, $read_length-$left-$right-1; + + $hash_read_short_end{$read_short_end_part} = undef; + $ff++; + } + } +
+ foreach my $exb0 (keys %filter_before1) # candidate 1: does the middle of the mate occur in the window
+ { + my $exb = $filter_before1{$exb0}; # mate sequence stored by the lookup loop earlier
+ my $match_pair_middle = substr $exb, $left, $read_length-$left-$right; # mate without $left leading and $right trailing characters
+FILTER_1: foreach my $line (keys %hash_read_short_end) + { + my $found_seq = '0'; + my $match_pair_middle_tmp = $match_pair_middle; + + $found_seq = $match_pair_middle_tmp =~ s/.$line/$line/; # match test on a copy; dots in the window piece act as regex wildcards
+ if ($found_seq > 0) + { + if (exists($filter_before1_pair{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $filter_before1_pair{$exb0}." FOUND1\n"; + print OUTPUT5 $exb." 
FOUND1_PAIR\n"; + } + if (exists($extensions_yuyu{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $extensions_yuyu{$exb0}." EXT1\n"; + } + push @extensions_before, $extensions_yuyu{$exb0}; + push @extensions_before1, $extensions_yuyu{$exb0}; + } + } + $count1b++; + last FILTER_1; # one hit per mate is enough
+ } + } + }
+ foreach my $exb0 (keys %filter_before2) # candidate 2: same window test
+ { + my $exb = $filter_before2{$exb0}; + my $match_pair_middle = substr $exb, $left, $read_length-$left-$right; +FILTER_2: foreach my $line (keys %hash_read_short_end) + { + my $found_seq = '0'; + my $match_pair_middle_tmp = $match_pair_middle; + + $found_seq = $match_pair_middle_tmp =~ s/.$line/$line/; + if ($found_seq > 0) + { + if (exists($filter_before2_pair{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $filter_before2_pair{$exb0}." FOUND2\n"; + print OUTPUT5 $exb." FOUND2_PAIR\n"; + } + if (exists($extensions_yuyu{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $extensions_yuyu{$exb0}." EXT2\n"; + } + push @extensions_before, $extensions_yuyu{$exb0}; + push @extensions_before2, $extensions_yuyu{$exb0}; + } + } + $count2b++; + last FILTER_2; + } + } + }
+ foreach my $exb0 (keys %filter_before3) # candidate 3: same window test
+ { + my $exb = $filter_before3{$exb0}; + my $match_pair_middle = substr $exb, $left, $read_length-$left-$right; +FILTER_3: foreach my $line (keys %hash_read_short_end) + { + my $found_seq = '0'; + my $match_pair_middle_tmp = $match_pair_middle; + + $found_seq = $match_pair_middle_tmp =~ s/.$line/$line/; + if ($found_seq > 0) + { + if (exists($filter_before3_pair{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $filter_before3_pair{$exb0}." FOUND3\n"; + print OUTPUT5 $exb." FOUND3_PAIR\n"; + } + if (exists($extensions_yuyu{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $extensions_yuyu{$exb0}." 
EXT3\n"; + } + push @extensions_before, $extensions_yuyu{$exb0}; + push @extensions_before3, $extensions_yuyu{$exb0} + } + } + $count3b++; + last FILTER_3; + } + } + }
+ foreach my $exb0 (keys %filter_before4) # candidate 4: same window test
+ { + my $exb = $filter_before4{$exb0}; + my $match_pair_middle = substr $exb, $left, $read_length-$left-$right; +FILTER_4: foreach my $line (keys %hash_read_short_end) + { + my $found_seq = '0'; + my $match_pair_middle_tmp = $match_pair_middle; + + $found_seq = $match_pair_middle_tmp =~ s/.$line/$line/; + if ($found_seq > 0) + { + if (exists($filter_before4_pair{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $filter_before4_pair{$exb0}." FOUND4\n"; + print OUTPUT5 $exb." FOUND4_PAIR\n"; + } + if (exists($extensions_yuyu{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $extensions_yuyu{$exb0}." EXT4\n"; + } + push @extensions_before, $extensions_yuyu{$exb0}; + push @extensions_before4, $extensions_yuyu{$exb0}; + } + } + $count4b++; + last FILTER_4; + } + } + } + if ($y > $startprint2) + { + print OUTPUT5 $count1b." COUNT1B\n"; + print OUTPUT5 $count2b." COUNT2B\n"; + print OUTPUT5 $count3b." COUNT3B\n"; + print OUTPUT5 $count4b." 
COUNT4B\n"; + }
+ $count1b_tmp = $count1b; # copies kept in variables declared outside this section
+ $count2b_tmp = $count2b; + $count3b_tmp = $count3b; + $count4b_tmp = $count4b;
+ my $f = '6'; # factor by which a count must exceed another non-zero count
+ if ($SNR_check1 ne "" || $SNR_check2 ne "") + { + $f = '8'; + } + if ($repetitive_detect ne "") + { + $f = '10'; + } + if ($type eq "mito_plant") + { + $f = '10'; + } + if ($type eq "mito_plant" && ($count1b+$count2b+$count3b+$count4b) > $average_coverage_ext/2) + { + $f = '25'; + } + if ($type eq "mito_plant" && ($count1b+$count2b+$count3b+$count4b) > $average_coverage_ext) + { + $f = '32'; + }
+ my $dup = ""; + my $r = '4'; + if ($type eq "mito_plant" && ($count1b+$count2b+$count3b+$count4b) > $average_coverage_ext*3) + { + $dup = "yes"; + $r = 9; + }
+ if ($type eq "mito_plant" && ($count1b+$count2b+$count3b+$count4b) > $average_coverage_ext*3) # NOTE(review): same condition as the block just before, so r is always 12 here - a different multiplier may have been intended
+ { + $dup = "yes"; + $r = 12; + } +
+ $count1234b{'1'} = $count1b; + $count1234b{'2'} = $count2b; + $count1234b{'3'} = $count3b; + $count1234b{'4'} = $count4b; + + my $differenceb1 = ""; + my $differenceb2 = ""; + my $differenceb3 = ""; + my $differenceb4 = ""; +
+ foreach my $count1234b (keys %count1234b) # flag candidate N when its count exceeds f times some other non-zero count
+ { + if ($count1234b ne '1' && $count1234b{$count1234b} > 0 && $count1b > $f*$count1234b{$count1234b}) + { + $differenceb1 = "yes"; + } + if ($count1234b ne '2' && $count1234b{$count1234b} > 0 && $count2b > $f*$count1234b{$count1234b}) + { + $differenceb2 = "yes"; + } + if ($count1234b ne '3' && $count1234b{$count1234b} > 0 && $count3b > $f*$count1234b{$count1234b}) + { + $differenceb3 = "yes"; + } + if ($count1234b ne '4' && $count1234b{$count1234b} > 0 && $count4b > $f*$count1234b{$count1234b}) + { + $differenceb4 = "yes"; + } + }
+ if ((($count1b > 2 && ($count2b+$count3b+$count4b) eq '0') || ($differenceb1 eq "yes" && $differenceb2 eq "" && $differenceb3 eq "" && $differenceb4 eq "")) && ($dup ne "yes" || ($count2b+$count3b+$count4b) < $average_coverage_ext/$r)) # candidate 1 wins on mate support
+ { + if (@extensions_before > 3000000) + { + $ext_before = "yes"; + @extensions_before = @extensions_before1; +
goto EXT_BEFORE; + }
+ delete $seed{$id_split1}; # candidate 1 kept: drop the three split seeds and continue with the current id
+ delete $seed{$id_split2}; + delete $seed{$id_split3}; + $best_extension = $best_extension1; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE5\n\n"; + } + delete $SNP_active{$id}; + delete $before{$id}; + delete $before_shorter_skip{$id}; + goto AFTER_EXT; + }
+ elsif ((($count2b > 2 && ($count1b+$count3b+$count4b) eq '0') || ($differenceb2 eq "yes" && $differenceb1 eq "" && $differenceb3 eq "" && $differenceb4 eq "")) && ($dup ne "yes" || ($count1b+$count3b+$count4b) < $average_coverage_ext/$r)) # candidate 2 wins on mate support
+ { + if (@extensions_before > 3000000) + { + $ext_before = "yes"; + @extensions_before = @extensions_before2; + goto EXT_BEFORE; + }
+ delete $seed{$id}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE6\n\n"; + } + delete $SNP_active{$id_split1}; + delete $before_shorter_skip{$id_split1}; + delete $before{$id_split1}; + $id = $id_split1; + if ($correction > 0) + { + delete $seed{$id_split1}; + $best_extension = $best_extension2; + goto AFTER_EXT; + }
+ else # NOTE(review): unlike the ONLY SECOND branch earlier, best_extension and ref_skip_before are not set before goto BACK - confirm
+ { + $read_new = $seed{$id_split1}; + $read_new1 = $read_new; + goto BACK; + } + }
+ elsif ((($count3b > 2 && ($count1b+$count2b+$count4b) eq '0') || ($differenceb3 eq "yes" && $differenceb2 eq "" && $differenceb1 eq "" && $differenceb4 eq "")) && ($dup ne "yes" || ($count2b+$count1b+$count4b) < $average_coverage_ext/$r)) # candidate 3 wins on mate support
+ { + if (@extensions_before > 300000) # NOTE(review): 300000 here, 3000000 in the branches for candidates 1 and 2 - confirm
+ { + $ext_before = "yes"; + @extensions_before = @extensions_before3; + goto EXT_BEFORE; + }
+ delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split3}; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE7\n\n"; + } + delete $SNP_active{$id_split2}; + delete $before{$id_split2}; + delete $before_shorter_skip{$id_split2}; + $id = $id_split2; + if ($correction > 0) + { + delete $seed{$id_split2}; + $best_extension = $best_extension3; + goto AFTER_EXT; + } + else +
{ + $read_new = $seed{$id_split2}; + $read_new1 = $read_new; + goto BACK; + } + }
+ elsif ((($count4b > 2 && ($count1b+$count2b+$count3b) eq '0') || ($differenceb4 eq "yes" && $differenceb2 eq "" && $differenceb3 eq "" && $differenceb1 eq "")) && ($dup ne "yes" || ($count2b+$count3b+$count1b) < $average_coverage_ext/$r)) # candidate 4 wins on mate support
+ { + if (@extensions_before > 300000) # NOTE(review): 300000 here, 3000000 in the branches for candidates 1 and 2 - confirm
+ { + $ext_before = "yes"; + @extensions_before = @extensions_before4; + goto EXT_BEFORE; + }
+ delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE8\n\n"; + } + delete $SNP_active{$id_split3}; + delete $before{$id_split3}; + delete $before_shorter_skip{$id_split3}; + $id = $id_split3; + if ($correction > 0) + { + delete $seed{$id_split3}; + $best_extension = $best_extension4; + goto AFTER_EXT; + } + else + { + $read_new = $seed{$id_split3}; + $read_new1 = $read_new; + goto BACK; + } + }
+ elsif (($count1b+$count2b+$count3b+$count4b) < 10 && $overhang < $read_length-($overlap*1.5) && $before_shorter_skip ne "yes" && $skip_overhang ne "yes") # fewer than 10 mate hits: flag before_shorter and rerun from BEFORE
+ { + if ($y > $startprint2) + { + print OUTPUT5 "\nMAKE BEFORE SHORTER3\n\n"; + } + $before_shorter = "yes"; + goto BEFORE; + }
+ elsif(($count1b+$count2b+$count3b+$count4b) < 10 && $overhang < $read_length-($overlap*1.5) && $before_shorter_skip ne "yes" && $insert_range_shorter eq "yes") # fewer than 10 hits in the shorter-insert retry: clear the flag and rerun
+ { + if ($y > $startprint2) + { + print OUTPUT5 "\nMAKE BEFORE SHORTER3a\n\n"; + } + $insert_range_shorter = ""; + goto BEFORE; + }
+ elsif(($count1b+$count2b+$count3b+$count4b) > 9 && $before_shorter_skip ne "yes" && $insert_range_shorter eq "" && $type ne "mito_plant") # enough hits but no winner: rerun once with the shorter insert range
+ { + if ($y > $startprint2) + { + print OUTPUT5 "\nSMALLER INSERT\n\n"; + } + $insert_range_shorter = "yes"; + goto BEFORE; + }
+ elsif (($count1b+$count2b+$count3b+$count4b) > 9 && $extensions_before ne "yes" && $repetitive_detect2 ne "yes" && $before_shorter_skip ne "yes") # still undecided: restart the vote on the collected extensions
+ { + $l = 0; + my $ll1 = '0'; + my $ll2 = '0'; +
if ($count1b ne 0)    # when $count1b is non-zero: $ll1 becomes the length of the longest string in @extensions_before1
+        {
+            foreach my $ext1 (@extensions_before1)
+            {
+                if (length($ext1) > $ll1)
+                {
+                    $ll1 = length($ext1);
+                }
+            }
+            $ll = $ll1;
+        }
+        if ($count2b ne 0)    # same scan over @extensions_before2, result in $ll2
+        {
+            foreach my $ext2 (@extensions_before2)
+            {
+                if (length($ext2) > $ll2)
+                {
+                    $ll2 = length($ext2);
+                }
+            }
+            if ($ll2 < $ll1)    # $ll is lowered to $ll2 only when $ll2 < $ll1; otherwise it keeps its previous value
+            {
+                $ll = $ll2;
+            }
+        }
+        $best_extension = "";
+        $SNP = "";
+        @extensions = @extensions_before;    # continue with the candidate list held in @extensions_before
+        $ext = '0';
+        foreach (@extensions)    # counts the elements: $ext ends up equal to the size of @extensions
+        {
+            $ext++;
+        }
+        $extensions_before = "yes";    # the enclosing elsif requires this flag to differ from "yes"
+        if ($y > $startprint2)    # log output is only written once $y exceeds $startprint2
+        {
+            print OUTPUT5 "\nEXTENSIONS FROM BEFORE0\n\n";
+        }
+        $read_new = $read;
+        $read_new1 = $read;
+        $best_extension_old1 = $best_extension1;    # snapshot of the four current best extensions
+        $best_extension_old2 = $best_extension2;
+        $best_extension_old3 = $best_extension3;
+        $best_extension_old4 = $best_extension4;
+        $SNP_active = "yes";
+        delete $before{$id};
+        $before_shorter_skip{$id} = "yes";
+        $split_forward = "";
+        delete $seed{$id};    # remove the current id and its split variants from %seed
+        delete $seed{$id_original};
+        delete $seed{$id_split1};
+        delete $seed{$id_split2};
+        delete $seed{$id_split3};
+        undef @extensions_group1;
+        undef @extensions_group2;
+        undef @extensions_group3;
+        undef @extensions_group4;
+        goto NUCLEO;    # the NUCLEO label lies outside this part of the file
+    }
+    }
+EXT_BEFORE: if ($ext_before eq "yes")    # jump target: an earlier branch sets $ext_before = "yes" (more than 300000 collected extensions) and jumps here
+    {
+        $l = 0;
+        $best_extension = "";
+        $SNP = "";
+        @extensions = @extensions_before;    # same reset as the branch above, without the $ll1/$ll2 length bookkeeping
+        $ext = '0';
+        foreach (@extensions)    # counts the elements of @extensions into $ext
+        {
+            $ext++;
+        }
+        $extensions_before = "yes";
+        if ($y > $startprint2)
+        {
+            print OUTPUT5 "\nEXTENSIONS FROM BEFORE\n\n";
+        }
+        $read_new = $read;
+        $read_new1 = $read;
+        $best_extension_old1 = $best_extension1;
+        $best_extension_old2 = $best_extension2;
+        $best_extension_old3 = $best_extension3;
+        $best_extension_old4 = $best_extension4;
+        $SNP_active = "yes";
+        delete $before{$id};
+        $before_shorter_skip{$id} = "yes";
+        $split_forward = "";
+        delete $seed{$id};
+        delete $seed{$id_original};
+        delete $seed{$id_split1};
+        delete $seed{$id_split2};
+        delete $seed{$id_split3};
+        undef @extensions_group1;
+        undef @extensions_group2;
+        undef @extensions_group3;
+
undef @extensions_group4; + goto NUCLEO; + } +JUMP_REP: if ($repetitive_detect ne "" && ($jump_rep eq "yes" || $jump_rep_because_stuck eq "yes")) + { + my $mm2; + delete $jump_rep{$id}; + my $end_repetitive = substr $read, -$insert_size-150; + if (length($read) < 1000) + { + $end_repetitive = $read; + } + if ($y > $startprint2) + { + print OUTPUT5 $end_repetitive." END_REP\n"; + } + my $second_try = ""; + my %repetitive_pair_tmp = %repetitive_pair; + my %rep_pair; + my %rep_pair_exclude; +REP_PAIR0: undef %rep_pair; + undef %rep_pair_exclude; +REP_PAIR: foreach my $rep_pair (keys %repetitive_pair) + { + if ($y > $startprint2) + { + print OUTPUT5 $rep_pair." REP_PAIR_TEST\n"; + } + my $part1 = substr $rep_pair, 0, ($read_length/3)*2; + my $part2 = substr $rep_pair, ($read_length/3)*2-5; + my $r = '0'; + my $end_repetitive_tmp = $end_repetitive; + while ($r < length($part1)-15) + { + my $testit = substr $part1, $r, 15; + my $found8 = $end_repetitive_tmp =~ s/$testit/$testit/; + + if (($found8 > 0 || $second_try eq "yes2" || $second_try eq "yes3") && length($rep_pair) > 0.9*$read_length) + { + my $s = '0'; + while ($s < length($part2)-15) + { + my $testit2 = substr $part2, $s, 15; + my $found7 = $end_repetitive_tmp =~ s/$testit2/$testit2/; + my $found9 = $part1 =~ s/$testit2/$testit2/; + if ($found7 > 0 || $found9 > 0) + { + next REP_PAIR; + } + $s = $s+2; + } + $rep_pair{$rep_pair} = $repetitive_pair{$rep_pair}; + if ($y > $startprint2) + { + print OUTPUT5 $rep_pair." 
REP_PAIR_FOUND\n"; + } + next REP_PAIR; + } + $r = $r+5; + } + } + my $hg = '0'; + foreach (keys %rep_pair) + { + $hg++; + } + my %temp_rep; + undef %temp_rep; + my $search_rev; + if ($hg eq '0') + { +PAIR_OF_PAIR: foreach my $rep_pair (keys %repetitive_pair) + { + my $ft = '0'; + while ($ft < length($rep_pair) - $overlap) + { + my $part_rep = substr $rep_pair, $ft, $overlap; + if (exists($hash2b{$part_rep})) + { + my $search = $hash2b{$part_rep}; + $search = substr $search, 1; + my @search = split /,/,$search; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[1]; + $search_rev = $search_tmp."2"; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[0]; + $search_rev = $search_tmp."1"; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $found =~ tr/ACTG/TGAC/; + my $found_reverse = reverse($found); + delete $repetitive_pair{$rep_pair}; + $temp_rep{$found_reverse} = $search_rev; + next PAIR_OF_PAIR; + } + } + } + $ft++; + } + delete $repetitive_pair{$rep_pair}; + } + %repetitive_pair = %temp_rep; + my $mm = '0'; + foreach (keys %repetitive_pair) + { + $mm++; + } + if ($mm > 0) + { + if ($y > $startprint2) + { + print OUTPUT5 "\n" + } + if ($second_try eq "yes") + { + $second_try = "yes2"; + } + elsif ($second_try eq "") + { + $second_try = "yes"; + } + elsif ($second_try eq "yes2") + { + $second_try = "yes3"; + } + elsif ($second_try eq "yes3") + { + goto REP_PAIR1; + } + goto REP_PAIR0; + } + } +REP_PAIR1a: my $most_match_max = '0'; + my $id_rep = ""; + my $rep_pair2; +REP_PAIR1: foreach my $rep_pair (keys %rep_pair) + { + if (exists($rep_pair_exclude{$rep_pair})) + { + next REP_PAIR1; + } + my $part2 = substr $rep_pair, ($read_length/3)*2; + my $s = '0'; + my $most_match_total = '0'; + while ($s < length($part2)-12) + { + my 
$part2b = substr $rep_pair, $s, 12;    # NOTE(review): the 12-mer is cut from $rep_pair while the loop bound uses length($part2) - confirm which string was intended
+                foreach my $rep_pair (keys %rep_pair)    # this inner $rep_pair shadows the outer loop variable
+                {
+                    my $check = $rep_pair =~ s/$part2b/$part2b/;    # self-substitution used as a "contains this 12-mer" test
+                    if ($check > 0)
+                    {
+                        $most_match_total++;
+                    }
+                }
+                $s += 5;
+            }
+            if ($most_match_total > $most_match_max)    # strictly greater: the first candidate that reaches the top score is kept
+            {
+                $most_match_max = $most_match_total;
+                $id_rep = $rep_pair{$rep_pair};
+                while (exists($seed{$id_rep}))    # prefix "0" until the id is not a key of %seed
+                {
+                    $id_rep = "0".$id_rep;
+                }
+                foreach my $test (keys %contigs)    # also prefix when a %contigs key already ends in "+<id>" or "+0<id>"
+                {
+                    if ($test =~ m/.*\+$id_rep$/)
+                    {
+                        $id_rep = "0".$id_rep;
+                    }
+                    if ($test =~ m/.*\+0$id_rep$/)
+                    {
+                        $id_rep = "00".$id_rep;
+                    }
+                }
+                $rep_pair2 = $rep_pair;
+            }
+        }
+        if ($id_rep ne "")    # a candidate was selected
+        {
+            $noforward{$id} = "stop";
+            $noforward = "stop";
+            my $rep_pair2_tmp = $rep_pair2;    # keep the uncorrected sequence: it is the key used for exclusion
+            $rep_pair2 = correct ($rep_pair2, \%repetitive_pair);
+            if ($rep_pair2 eq "")    # correct() returned an empty string: exclude this candidate and choose again
+            {
+                $rep_pair_exclude{$rep_pair2_tmp} = undef;
+                $id_rep = "";
+                goto REP_PAIR1a;    # FIX: was REP_PAIR1, which kept the old $most_match_max so no remaining candidate could exceed it; REP_PAIR1a re-initialises the maximum before the scan
+            }
+            $seed{$id_rep} = $rep_pair2;    # register the corrected sequence as a new seed under $id_rep
+            $seeds_check{$id_rep} = undef;
+            $nosecond{$id} = undef;
+            $nosecond = "yes";
+            $insert_size2{$id_rep} = $insert_size;
+            $position{$id_rep} = length($rep_pair2);
+            $old_id{$id_rep} = $id;    # link the new seed back to the id it was derived from
+            $old_id2{$id_rep} = undef;
+            $old_rep{$id_rep} = undef;
+            $old_rep_old{$id} = undef;
+            $seed_old{$id} = $read;
+
+            $read_new = $read;
+            $read_new1 = $read_new;
+            delete $seed{$id_split1};
+
+            if (exists($old_id{$id}))    # the current id itself has a predecessor: continue at MERGE (label outside this chunk)
+            {
+                $merge_now = "yes";
+                goto MERGE;
+            }
+            print OUTPUT5 $id_rep." 
ID_REP\n"; + if ($noback eq "stop") + { + print OUTPUT5 ">".$id."\n"; + print OUTPUT5 $read."\n"; + delete $seed{$id}; + if (exists($seed_old{$id})) + { + $seed_old{$id} = $read; + } + if (!keys %seed) + { + $circle = "contigs"; + goto FINISH; + } + else + { + goto ITERATION; + } + } + else + { + print OUTPUT5 ">1\n"; + $noforward = "stop"; + $noforward{$id} = "stop"; + $seed_split{$id} = undef; + $best_extension = ""; + goto BACK; + } + } + if ($mm2 eq "") + { + $mm2 = '0'; + my %rep_check; + undef %rep_check; + +REP_CHECK0: foreach my $exts (keys %extensions_original) + { + my $search_tmp = substr $extensions_original{$exts}, 0, -1; + my $search_end = substr $extensions_original{$exts}, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + $found =~ tr/ACTG/TGAC/; + my $found2 = reverse($found); + if ($y > $startprint2) + { + print OUTPUT5 $found2." REP_PAIR_TEST_B\n"; + } + $rep_check{$found2} = $extensions_original{$exts}; + if ($y > $startprint2) + { + print OUTPUT5 $found2." REP_PAIR_FOUND_B\n"; + } + next REP_CHECK0; + } + } + + foreach (keys %rep_check) + { + $mm2++; + } + if ($mm2 > 0) + { + %repetitive_pair = %rep_check; + goto REP_PAIR0; + } + } + $noforward{$id} = "stop"; + } + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + $before_shorter_skip{$id} = "yes"; + if ($before_shorter_skip eq "yes") + { + $before{$id} = "yes"; + } + $best_extension = ""; + $read_new = $read; + $read_new1 = $read_new; + + if ($count_split > 2) + { + my $tmp = '0'; + if ($first_yuyu ne "yes" || (($count1b_tmp+$count2b_tmp+$count3b_tmp+$count4b_tmp) > 4 && $count1b_tmp eq '0')) + { + $yuyu_option{$id.$firstSNP_max[$tmp]} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 $firstSNP_max[$tmp]." 
FIRST_YUYU\n"; + } + } + $tmp++; + if ($second_yuyu ne "yes" || (($count1b_tmp+$count2b_tmp+$count3b_tmp+$count4b_tmp) > 4 && $count2b_tmp eq '0')) + { + $yuyu_option{$id.$firstSNP_max[$tmp]} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 $firstSNP_max[$tmp]." SECOND_YUYU\n"; + } + } + $tmp++; + if ($third_yuyu ne "yes" || (($count1b_tmp+$count2b_tmp+$count3b_tmp+$count4b_tmp) > 4 && $count3b_tmp eq '0')) + { + $yuyu_option{$id.$firstSNP_max[$tmp]} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 $firstSNP_max[$tmp]." THIRD_YUYU\n"; + } + } + $tmp++; + if ($fourth_yuyu ne "yes" || (($count1b_tmp+$count2b_tmp+$count3b_tmp+$count4b_tmp) > 4 && $count4b_tmp eq '0')) + { + $yuyu_option{$id.$firstSNP_max[$tmp]} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 $firstSNP_max[$tmp]." FOURTH_YUYU\n"; + } + } + } + goto BACK; + } + } + my $split_no_rep = ""; + if ($jump_rep ne "yes" && $repetitive_detect eq "yes") + { + my $read_end9 = substr $read_short_end2, -$read_length; + my $ext1 = substr $best_extension1, 0, 9; + my $ext2 = substr $best_extension2, 0, 9; + my $check_ext1 = $read_end9 =~ s/(.)$ext1/$1$ext1/; + my $check_ext2 = $read_end9 =~ s/(.)$ext2/$1$ext2/; + if ($check_ext1 > 0 || $check_ext2 > 0) + { + $split_no_rep = "yes"; + } + } + if ($deletion ne "yes" && $reference_guided ne "yes" && $type ne "mito_plant" && $last_chance ne "yes" && $SNP_active eq "" && $contig_end ne "yes" && $indel_split_skip ne "yes" && ($delete_first ne "yes" || (length($best_extension1) < 5 && length($best_extension2) < 5)) && $ext_total > 18 && $indel_split < ($read_length-25 -$overlap)) + { + $best_extension = ""; + $indel_split{$id} = $indel_split+10; + if ($y > $startprint2) + { + print OUTPUT5 "INCREASE_INDEL_SPLIT\n"; + print OUTPUT5 $indel_split." 
INDEL_SPLIT\n"; + } + $read_new = $read; + $read_new1 = $read_new; + goto BACK; + } + elsif (($SNP_active eq "" || ($extensions_before eq "yes" && $deletion ne "yes" && $reference_guided ne "yes")) && $contig_end ne "yes" && ($delete_first ne "yes" || (length($best_extension1) < 5 && length($best_extension2) < 5)) && $deletion ne "yes" && $reference_guided ne "yes") + { + $best_extension = ""; + if ($y > $startprint2) + { + print OUTPUT5 "SNP_ACTIVE\n"; + } + $SNP_active{$id} = "yes"; + delete $indel_split{$id}; + + $read_new = $read; + $read_new1 = $read_new; + goto BACK; + } + elsif ($deletion eq "yes" || $reference_guided eq "yes") + { + if ($y > $startprint2) + { + print OUTPUT5 "DELETION_DETECT\n"; + } + delete $seed{$id_split1}; + delete $seed{$id}; + delete $before{$id}; + delete $before_shorter_skip{$id}; + $read_new = $read; + $read_new1 = $read_new; + } + elsif ($jump_rep ne "yes" && $repetitive_detect ne "" && ($split_no_rep eq "yes" || $repetitive_detect2 eq "yes")) + { + if ($y > $startprint2) + { + print OUTPUT5 "JUMP REP\n"; + } + delete $indel_split{$id}; + $jump_rep{$id} = "yes"; + $best_extension = ""; + $read_new = $read; + $read_new1 = $read_new; + goto BACK; + } + + + elsif($delete_first ne "yes" && $deletion ne "yes" && $reference_guided ne "yes" && $repetitive_detect eq "") + { + if (exists $old_id{$id} && ($noback eq "stop" || $position_back >= ($insert_size*3))) + { + $merge_read_length = length ($read); + $merge_read = "yes"; + $read = $seed_old{$old_id{$id}} ."LLLLLLLLLLLLLLL".$read; + $seed{$id} = $read; + $hasL = "yes"; + foreach my $tree_tmp (keys %tree) + { + my $old = $old_id{$id}; + my $tree2 = $tree{$tree_tmp}; + my $tree3 = $tree{$tree_tmp}; + if ($old_id{$id} =~ m/.*_(\d+)$/) + { + $old = $1; + } + my $id_tmp = $id; + if ($id =~ m/.*_(\d+)$/) + { + $id_tmp = $1; + } + my @ids_split = split /\*/, $tree2; + foreach my $id_split (@ids_split) + { + if ($id_split =~ m/^$old(B*REP)*$/) + { + if ($tree2 =~ 
m/^(.*\*)*$old(B*REP)*(\*.*)*$/) + { + if (defined($1)) + { + $tree3 = $1.$id_tmp; + } + else + { + $tree3 = $id_tmp; + } + if (defined($2)) + { + if ($2 eq "BREP") + { + $tree3 = $tree3."REPB"; + } + else + { + $tree3 = $tree3."REP"; + } + } + if (defined($3)) + { + $tree3 = $tree3.$3; + } + } + } + } + delete $tree{$tree_tmp}; + $tree{$tree_tmp} = $tree3; + foreach my $contigs_end (keys %contigs_end) + { + if ($contigs_end{$contigs_end} eq $old) + { + delete $contigs_end{$contigs_end}; + $contigs_end{$contigs_end} = $id_tmp; + } + } + } + delete $old_id{$id}; + $noback{$id} = "stop"; + if ($y > $startprint2) + { + print OUTPUT5 "Merged contigs with LLLLLLLLLLL!\n"; + } + $contig_gap_min{$id."_".$contig_count} = ($contig_gap_min{$id."_".$contig_count}-$position_back); + $contig_gap_max{$id."_".$contig_count} = ($contig_gap_max{$id."_".$contig_count}-$position_back); + } + $indel_split = '0'; + delete $indel_split{$id}; + $best_extension = ""; + + delete $seed{$id_split1}; + delete $seed{$id}; + $seed{$id} = $read; + $seeds_check{$id} = undef; + $read_new = $read; + $read_new1 = $read; + + my $best_extension1_tmpi = $best_extension1; + my $best_extension2_tmpi = $best_extension2; + $best_extension1_tmpi =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//d; + $best_extension2_tmpi =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//d; + if ($SNR_read eq "yes" && length($best_extension1_tmpi) < 7 && length($best_extension2_tmpi) < 7) + { + $SNR_next_seed = "yes"; + $noforward{$id} = "stop"; + delete $seed{$id_split2}; + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split3}; + $noforward = "stop"; + if ($y > $startprint2) + { + print OUTPUT5 "SNR_NEXT_SEED\n"; + } + goto BACK; + } + if ($y > $startprint2) + { + print OUTPUT5 ">".$id."\n"; + print OUTPUT5 $read."\n\n\n"; + } + my $best_ex1 = substr $best_extension1, 0, 7; + my $best_ex2 = substr $best_extension2, 0, 7; + my $best_ex3 = substr $best_extension3, 0, 7; + my $best_ex4 = substr $best_extension4, 0, 7; + $best_ex1 =~ 
tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $best_ex2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $best_ex3 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $best_ex4 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $contigs_endi = substr $read_end, -10; + my $read_short_end2_tmp = $read_short_end2; + my $check_rep1 = $read_short_end2_tmp =~ s/$contigs_end0$best_ex1/$contigs_end0$best_ex1/g; + my $check_rep2 = $read_short_end2_tmp =~ s/$contigs_end0$best_ex2/$contigs_end0$best_ex2/g; + my $check_rep3 = $read_short_end2_tmp =~ s/$contigs_end0$best_ex3/$contigs_end0$best_ex3/g; + my $check_rep4 = $read_short_end2_tmp =~ s/$contigs_end0$best_ex4/$contigs_end0$best_ex4/g; + +CORRECT: my $contig_id2_tmp = substr $contig_id2, 0,-1; + + my $contigs_end1 = substr $best_extension1, 0, 7; + my $contigs_end2 = substr $best_extension2, 0, 7; + my $contigs_end3 = substr $best_extension3, 0, 7; + my $contigs_end4 = substr $best_extension4, 0, 7; + $contigs_end1 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $contigs_end2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $contigs_end3 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $contigs_end4 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $contigs_end0 = substr $read_end, -15; + my $tree_empty = ""; + + if (exists($contigs_id{$contig_id2}) || exists($contigs_end{$contigs_end0.$contigs_end2})) + { + $tree{$id} = $contigs_end{$contigs_end0.$contigs_end2}; + $tree_empty = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "TREE_EMPTY2\n"; + } + } + else + { + $seed{$contig_id2} = $contig_read2; + $seeds_check{$contig_id2} = undef; + $insert_size2{$contig_id2} = $insert_size; + $position{$contig_id2} = length($contig_read2); + $old_id2{$contig_id2} = undef; + $noback{$contig_id2} = "stop"; + + $contigs_id{$contig_id2} = undef; + $contigs_end{$contigs_end0.$contigs_end2} = $contig_id2; + $correct_after_split = "yes"; + $tree{$id} = $contig_id2; + + if ($y > $startprint2) + { + print OUTPUT5 $contig_id2." CONTIG_ID2\n"; + print OUTPUT5 $best_extension2." 
best_ext2\n"; + my $contig_id2hhh = substr $contig_id2, 0 ,-1; + if (exists($hash{$contig_id2hhh})) + { + print OUTPUT5 $hash{$contig_id2hhh}." HASH2\n"; + } + } + if (length($read) < 251) + { + delete $seed{$contig_id2}; + $seed{$contig_id2} = $read.$best_extension2; + $seeds_check{$contig_id2} = undef; + delete $tree{$id}; + if ($y > $startprint2) + { + print OUTPUT5 "SHORT2\n"; + } + foreach my $tree_tmp (keys %tree) + { + my $tmp = $tree{$tree_tmp}; + my @ids_split = split /\*/, $tmp; + foreach my $id_split (@ids_split) + { + if ($id_split eq $id) + { + delete $tree{$tree_tmp}; + if ($tmp =~ m/^(.*\*)*$id(\*.*)*$/) + { + if (defined($1)) + { + $tmp = $1.$contig_id2; + } + else + { + $tmp = $contig_id2; + } + if (defined($2)) + { + $tmp = $tmp.$2; + } + if ($y > $startprint2) + { + print OUTPUT5 $tmp." TESTOOO5\n"; + } + $tree{$tree_tmp} = $tmp; + foreach my $end_tmp (keys %contigs_end) + { + if ($contigs_end{$end_tmp} eq $id) + { + delete $contigs_end{$end_tmp}; + $contigs_end{$end_tmp} = $contig_id2; + } + } + } + } + } + } + } + } + if (exists($contigs_id{$contig_id1}) || exists($contigs_end{$contigs_end0.$contigs_end1})) + { + if ($tree_empty eq "yes") + { + $tree{$id} = $contigs_end{$contigs_end0.$contigs_end2}."*".$contigs_end{$contigs_end0.$contigs_end1}; + } + else + { + $tree{$id} = $contig_id2."*".$contigs_end{$contigs_end0.$contigs_end1}; + } + } + else + { + + $seed{$contig_id1} = $contig_read1; + $seeds_check{$contig_id1} = undef; + $insert_size2{$contig_id1} = $insert_size; + $position{$contig_id1} = length($contig_read1); + $old_id2{$contig_id1} = undef; + $noback{$contig_id1} = "stop"; + + $contigs_id{$contig_id1} = undef; + $contigs_end{$contigs_end0.$contigs_end1} = $contig_id1; + $correct_after_split = "yes"; + + if ($tree_empty eq "yes") + { + $tree{$id} = $contigs_end{$contigs_end0.$contigs_end2}."*".$contig_id1; + } + else + { + $tree{$id} = $contig_id2."*".$contig_id1; + } + + if ($y > $startprint2) + { + print OUTPUT5 $contig_id1." 
CONTIG_ID1\n"; + print OUTPUT5 $best_extension1." best_ext1\n"; + my $contig_id1hhh = substr $contig_id1, 0 ,-1; + if (exists($hash{$contig_id1hhh})) + { + print OUTPUT5 $hash{$contig_id1hhh}." HASH1\n"; + } + } + if (length($read) < 251) + { + delete $seed{$contig_id1}; + $seed{$contig_id1} = $read.$best_extension1; + $seeds_check{$contig_id1} = undef; + delete $tree{$id}; + foreach my $tree_tmp (keys %tree) + { + my $tmp = $tree{$tree_tmp}; + my @ids_split = split /\*/, $tmp; + foreach my $id_split (@ids_split) + { + if ($tree_empty eq "yes") + { + if ($id_split eq $id) + { + delete $tree{$tree_tmp}; + if ($tmp =~ m/^(.*\*)*$id(\*.*)*$/) + { + if (defined($1)) + { + $tmp = $1.$contig_id1; + } + else + { + $tmp = $contig_id1; + } + if (defined($2)) + { + $tmp = $tmp.$2; + } + $tree{$tree_tmp} = $contigs_end{$contigs_end0.$contigs_end2}."*".$tmp; + if ($y > $startprint2) + { + print OUTPUT5 $tree{$tree_tmp}." TESTOOO4a-----------------\n"; + } + foreach my $end_tmp (keys %contigs_end) + { + if ($contigs_end{$end_tmp} eq $id) + { + delete $contigs_end{$end_tmp}; + $contigs_end{$end_tmp} = $contig_id1; + } + } + } + } + } + else + { + if ($id_split eq $contig_id2) + { + delete $tree{$tree_tmp}; + $tree{$tree_tmp} = $contig_id1."*".$tmp; + if ($y > $startprint2) + { + print OUTPUT5 $tree{$tree_tmp}." 
TESTOOO5------------\n"; + } + foreach my $end_tmp (keys %contigs_end) + { + if ($contigs_end{$end_tmp} eq $contig_id2) + { + delete $contigs_end{$end_tmp}; + $contigs_end{$end_tmp} = $contig_id2."*".$contig_id1; + } + } + } + } + } + } + } + } + if ($count_split > 2) + { + if (exists($contigs_id{$contig_id3}) || exists($contigs_end{$contigs_end0.$contigs_end3})) + { + $tree{$id} = $contigs_end{$contigs_end0.$contigs_end3}; + $tree_empty = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "TREE_EMPTY3\n"; + } + } + else + { + $seed{$contig_id3} = $contig_read3; + $seeds_check{$contig_id3} = undef; + $insert_size2{$contig_id3} = $insert_size; + $position{$contig_id3} = length($contig_read3); + $old_id2{$contig_id3} = undef; + $noback{$contig_id3} = "stop"; + + $contigs_id{$contig_id3} = undef; + $contigs_end{$contigs_end0.$contigs_end3} = $contig_id3; + $correct_after_split = "yes"; + $tree{$id} = $contig_id3; + + if ($y > $startprint2) + { + print OUTPUT5 $contig_id3." CONTIG_ID3\n"; + print OUTPUT5 $best_extension3." best_ext3\n"; + my $contig_id3hhh = substr $contig_id3, 0 ,-1; + if (exists($hash{$contig_id3hhh})) + { + print OUTPUT5 $hash{$contig_id3hhh}." HASH3\n"; + } + } + if (length($read) < 251) + { + delete $seed{$contig_id3}; + $seeds_check{$contig_id3} = undef; + $seed{$contig_id3} = $read.$best_extension3; + delete $tree{$id}; + if ($y > $startprint2) + { + print OUTPUT5 "SHORT2\n"; + } + foreach my $tree_tmp (keys %tree) + { + my $tmp = $tree{$tree_tmp}; + my @ids_split = split /\*/, $tmp; + foreach my $id_split (@ids_split) + { + if ($id_split eq $id) + { + delete $tree{$tree_tmp}; + if ($tmp =~ m/^(.*\*)*$id(\*.*)*$/) + { + if (defined($1)) + { + $tmp = $1.$contig_id3; + } + else + { + $tmp = $contig_id3; + } + if (defined($2)) + { + $tmp = $tmp.$2; + } + if ($y > $startprint2) + { + print OUTPUT5 $tmp." 
TESTOOO5\n"; + } + $tree{$tree_tmp} = $tmp; + foreach my $end_tmp (keys %contigs_end) + { + if ($contigs_end{$end_tmp} eq $id) + { + delete $contigs_end{$end_tmp}; + $contigs_end{$end_tmp} = $contig_id3; + } + } + } + } + } + } + } + } + } + if ($count_split > 3) + { + if (exists($contigs_id{$contig_id4}) || exists($contigs_end{$contigs_end0.$contigs_end4})) + { + $tree{$id} = $contigs_end{$contigs_end0.$contigs_end4}; + $tree_empty = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "TREE_EMPTY4\n"; + } + } + else + { + $seed{$contig_id4} = $contig_read4; + $seeds_check{$contig_id4} = undef; + $insert_size2{$contig_id4} = $insert_size; + $position{$contig_id4} = length($contig_read4); + $old_id2{$contig_id4} = undef; + $noback{$contig_id4} = "stop"; + + $contigs_id{$contig_id4} = undef; + $contigs_end{$contigs_end0.$contigs_end4} = $contig_id4; + $correct_after_split = "yes"; + $tree{$id} = $contig_id4; + + if ($y > $startprint2) + { + print OUTPUT5 $contig_id4." CONTIG_ID4\n"; + print OUTPUT5 $best_extension4." best_ext4\n"; + my $contig_id4hhh = substr $contig_id4, 0 ,-1; + if (exists($hash{$contig_id4hhh})) + { + print OUTPUT5 $hash{$contig_id4hhh}." HASH4\n"; + } + } + if (length($read) < 251) + { + delete $seed{$contig_id4}; + $seed{$contig_id4} = $read.$best_extension4; + $seeds_check{$contig_id4} = undef; + delete $tree{$id}; + if ($y > $startprint2) + { + print OUTPUT5 "SHORT4\n"; + } + foreach my $tree_tmp (keys %tree) + { + my $tmp = $tree{$tree_tmp}; + my @ids_split = split /\*/, $tmp; + foreach my $id_split (@ids_split) + { + if ($id_split eq $id) + { + delete $tree{$tree_tmp}; + if ($tmp =~ m/^(.*\*)*$id(\*.*)*$/) + { + if (defined($1)) + { + $tmp = $1.$contig_id4; + } + else + { + $tmp = $contig_id4; + } + if (defined($2)) + { + $tmp = $tmp.$2 + } + if ($y > $startprint2) + { + print OUTPUT5 $tmp." 
TESTOOO5\n";
+                                }
+                                $tree{$tree_tmp} = $tmp;
+                                foreach my $end_tmp (keys %contigs_end)    # re-point %contigs_end entries from $id to $contig_id4
+                                {
+                                    if ($contigs_end{$end_tmp} eq $id)
+                                    {
+                                        delete $contigs_end{$end_tmp};
+                                        $contigs_end{$end_tmp} = $contig_id4;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    if ($noback eq "stop")    # backward extension has stopped: store the finished sequence(s) in %contigs
+    {
+        if (exists($old_id{$id}))
+        {
+            my $read_tmp = $seed_old{$old_id{$id}};    # sequence saved for the id this seed was derived from
+            if (length($read_tmp) > 250)
+            {
+                if ($contig_num eq '1')    # first contig: also derive the $first_contig_start anchor from it
+                {
+                    $contigs{$contig_num."+".$id} = $read_tmp;
+                    if ($y > $startprint2)
+                    {
+                        print OUTPUT5 $contig_num."+".$id." ADD_CONTIG10\n";
+                    }
+                    my $start_point = '500';
+                    my $check_repetitive = '3';
+
+                    while ($check_repetitive > 2)    # move the start point in steps of 20 while the 15-mer occurs more than twice nearby
+                    {
+                        my $repetitive = substr $read_tmp, $start_point, 15;    # undef when $start_point lies beyond the end of $read_tmp
+                        $repetitive =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;    # tr without replacement list or /d only counts; $repetitive is unchanged
+                        my $read_short_area = substr $read_tmp, $start_point -170, 340;
+                        $check_repetitive = (defined($repetitive) && $repetitive ne "") ? ($read_short_area =~ s/$repetitive/$repetitive/g) : 0;    # FIX: an empty pattern would make s/// reuse the last successful regex
+                        if ($check_repetitive > 2)
+                        {
+                            $start_point += 20;
+                            if ($y > $startprint2)
+                            {
+                                print OUTPUT5 "DETECT_REPETITIVE_IN_START_SEQUENCE\n";
+                            }
+                        }
+                    }
+                    $first_contig_start = substr $read_tmp, $start_point, $overlap;
+                    my $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;    # number of ambiguity codes / dots in the anchor
+                    while ($check_start > 0)    # shift by 10 until the anchor contains none of them
+                    {
+                        $start_point += 10;
+                        $first_contig_start = substr $read_tmp, $start_point, $overlap;
+                        $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;
+                    }
+                    if ($y > $startprint2)
+                    {
+                        print OUTPUT5 $first_contig_start." CONTIG_START3\n";
+                    }
+                    foreach my $seedie (keys %seed)    # drop seeds whose first 1000 characters contain the anchor
+                    {
+                        my $seedie_part = substr $seed{$seedie}, 0, 1000;
+                        my $check_seedie = (defined($first_contig_start) && $first_contig_start ne "" && length($first_contig_start) >= $overlap) ? ($seedie_part =~ s/$first_contig_start/$first_contig_start/) : 0;    # FIX: skip when the contig was too short to give a full anchor (empty pattern reuses the last regex, a truncated one matches almost anything)
+                        if ($check_seedie > 0)
+                        {
+                            delete $seed{$seedie};
+                        }
+                    }
+                }
+                else
+                {
+                    $contigs{$contig_num."+".$old_id{$id}} = $read_tmp;
+                    if ($y > $startprint2)
+                    {
+                        print OUTPUT5 $contig_num."+".$id." 
ADD_CONTIG11\n";
+                    }
+                }
+                $contig_num++;
+            }
+        }
+        if (length($read) > 250)    # store the current sequence as a contig when it is longer than 250
+        {
+            if ($contig_num eq '1')    # first contig: also derive the $first_contig_start anchor from it
+            {
+                $contigs{$contig_num."+".$id} = $read;
+                if ($y > $startprint2)
+                {
+                    print OUTPUT5 $contig_num."+".$id." ADD_CONTIG12\n";
+                }
+                my $start_point = '500';
+                my $check_repetitive = '3';
+
+                while ($check_repetitive > 2)    # move the start point in steps of 20 while the 15-mer occurs more than twice nearby
+                {
+                    my $repetitive = substr $read, $start_point, 15;    # undef when $start_point lies beyond the end of $read
+                    $repetitive =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;    # tr without replacement list or /d only counts; $repetitive is unchanged
+                    my $read_short_area = substr $read, $start_point -170, 340;
+                    $check_repetitive = (defined($repetitive) && $repetitive ne "") ? ($read_short_area =~ s/$repetitive/$repetitive/g) : 0;    # FIX: an empty pattern would make s/// reuse the last successful regex
+                    if ($check_repetitive > 2)
+                    {
+                        $start_point += 20;
+                        if ($y > $startprint2)
+                        {
+                            print OUTPUT5 "DETECT_REPETITIVE_IN_START_SEQUENCE\n";
+                        }
+                    }
+                }
+                $first_contig_start = substr $read, $start_point, $overlap;
+                my $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;    # number of ambiguity codes / dots in the anchor
+                while ($check_start > 0)    # shift by 10 until the anchor contains none of them
+                {
+                    $start_point += 10;
+                    $first_contig_start = substr $read, $start_point, $overlap;
+                    $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;
+                }
+                if ($y > $startprint2)
+                {
+                    print OUTPUT5 $first_contig_start." CONTIG_START4\n";
+                }
+                foreach my $seedie (keys %seed)    # drop seeds whose first 1000 characters contain the anchor
+                {
+                    my $seedie_part = substr $seed{$seedie}, 0, 1000;
+                    my $check_seedie = (defined($first_contig_start) && $first_contig_start ne "" && length($first_contig_start) >= $overlap) ? ($seedie_part =~ s/$first_contig_start/$first_contig_start/) : 0;    # FIX: skip when the contig was too short to give a full anchor (empty pattern reuses the last regex, a truncated one matches almost anything)
+                    if ($check_seedie > 0)
+                    {
+                        delete $seed{$seedie};
+                    }
+                }
+            }
+            else
+            {
+                $contigs{$contig_num."+".$id} = $read;
+                if ($y > $startprint2)
+                {
+                    print OUTPUT5 $contig_num."+".$id." 
ADD_CONTIG13\n"; + } + } + $contig_num++; + } + delete $seed{$id}; + if (!keys %seed) + { + $circle = "contigs"; + goto FINISH; + } + else + { + goto ITERATION; + } + } + else + { + $noforward = "stop"; + $noforward{$id} = "stop"; + $seed_split{$id} = undef; + $best_extension = ""; + goto BACK; + } + } + else + { + $indel_split = '0'; + delete $indel_split{$id}; + } + } + else + { + if ($best_extension ne "") + { + $indel_split = '0'; + delete $indel_split{$id}; + } + + + my $best_extension_no_dot = $best_extension; + my @ext = split //, $best_extension; + + my $u = length($best_extension); + my $v = '1'; + if (($SNR_test eq "yes2" || $SNR_test eq "yes2_double") && $u > 2) + { + my @ext3 = split //, $best_extension; + $v = '1'; + $u = length($best_extension); + while ($ext3[$u-$v-1] eq $ext3[$u-$v] && ($u-$v-1) > 1) + { + chop($best_extension); + $v++; + } + } + if (($SNR_test eq "yes222222" || $SNR_test eq "yes2_double") && $u > 2) + { + my @ext3 = split //, $best_extension; + $v = '1'; + $u = length($best_extension); + while ($ext3[$u-$v-1] eq $ext3[$u-$v] && ($u-$v-1) > 1) + { + chop($best_extension); + $v++; + } + my $SNR_max = '0'; + my $SNR_min = '1000'; + my $n = '0'; + my $extensions_after_SNR = substr $best_extension, 0,4; + + foreach my $SNR_ext (keys %SNR_count) + { + my $SNR_count = $SNR_count{$SNR_ext}; + my $checkie = substr $SNR_ext, 0, 4; + if ($SNR_count ne "" && $checkie eq $extensions_after_SNR) + { + if ($SNR_count > $SNR_max) + { + $SNR_max = $SNR_count; + } + if ($SNR_count < $SNR_min) + { + $SNR_min = $SNR_count; + } + } + } + if ($SNR_min eq '1000') + { + $SNR_min = '0'; + } + my $p = '0'; + my $ut = '0'; + + if ($SNR_test eq "yes2" && $SNR_max > 0) + { + $best_extension = $SNR_nucleo.$best_extension; + $ut = '1'; + while ($p < $SNR_max - $SNR_min) + { + $best_extension = "X".$best_extension; + $p++; + } + } + elsif ($SNR_test eq "yes2_double" && $SNR_max > 0) + { + $best_extension = $SNR_nucleo.$best_extension; + $ut = '2'; + while ($p < 
($SNR_max - $SNR_min)/2) + { + $best_extension = "X2".$best_extension; + $p++; + } + } + $p = '0'; + + while ($p+$ut < $SNR_min) + { + $best_extension = $SNR_nucleo.$best_extension; + if ($SNR_test eq "yes2") + { + $p++; + } + if ($SNR_test eq "yes2_double") + { + $p++; + $p++; + } + } + delete $SNR{$id}; + } + elsif ($SNR_test eq "yes2222222" || $SNR_test eq "yes2_double") + { + my $ee2 = '10000'; + foreach (keys %SNR_count) + { + my $ee = $SNR_count{$_}; + if ($ee < $ee2) + { + $ee2 = $ee; + } + } + $best_extension = ""; + my $ee3 = '0'; + if ($ee2 ne '10000') + { + while ($ee3 < $ee2) + { + $best_extension = $SNR_nucleo.$best_extension; + $ee3++; + } + } + else + { + $best_extension = ""; + } + } + $SNR_test = ""; + if ($y > $startprint2) + { + print OUTPUT5 $best_extension." BEST_EXTENSION\n\n"; + } + $best_extension_prev{$id} = $best_extension; + } +AFTER_EXT: + + chomp $best_extension; + my $vk2 = '0'; + if ($SNR_read eq "") + { + my $best_extension_tmp7 = $best_extension; + $best_extension_tmp7 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + my @dot2 = split //, $best_extension_tmp7; + my $ut2 = length($best_extension_tmp7); + + while ($dot2[$ut2-1] eq "." || $dot2[$ut2-1] eq "*") + { + if ($dot2[$ut2-1] eq "*") + { + chop $best_extension; + chop $best_extension; + $vk2++; + $vk2++; + $ut2--; + $ut2--; + } + else + { + chop $best_extension; + $vk2++; + $ut2--; + } + } + } + my $best_extension_tmpp = substr $best_extension, -15; + my $SNR_check = $best_extension_tmpp =~ qr/AAAAAAA|CCCCCCC|GGGGGGG|TTTTTTT/; + if ($SNR_check > 0 && $variance_detection eq "" && $heteroplasmy eq "") + { + if ($best_extension =~ m/(.*?(AAAAAA|CCCCCC|GGGGGG|TTTTTT))(.*)/) + { + my $best_extension_short = $1; + if (length($3) < length($best_extension_short)+5) + { + $best_extension = $best_extension_short; + if ($y > $startprint2) + { + print OUTPUT5 $best_extension." 
BEST_EXTENSION_SHORT\n\n"; + } + } + } + } + + if ($best_extension ne "" && $best_extension ne " ") + { + $read_new = $read.$best_extension; + $read_new1 = $read_new; + + $position += length($best_extension); + $position -= $vk2; + + delete $position{$id}; + $position{$id} = $position; + + if ($best_extension ne "" && $split_forward eq "") + { + delete $indel_split_skip{$id}; + delete $before{$id}; + delete $before_shorter_skip{$id}; + } + if (exists($last_ref_seq_forward{$id}) && $split eq "") + { + my $seq_tmp = $last_ref_seq_forward{$id}; + $last_ref_seq_forward{$id} = $seq_tmp.$best_extension; + } + if (exists($large_variance_forward{$id}) && $split eq "") + { + $large_variance_length_forward{$id} = $large_variance_length_forward{$id}+length($best_extension); + } + $best_extension_forward = $best_extension; + $best_extension = ""; + delete $regex{$id}; + if ($split eq "") + { + delete $yuyu_option{$id.'A'}; + delete $yuyu_option{$id.'C'}; + delete $yuyu_option{$id.'T'}; + delete $yuyu_option{$id.'G'}; + delete $before_shorter_skip{$id}; + } + if ($SNR_read ne "" && $last_chance eq "yes") + { + delete $last_chance{$id}; + $last_chance{$id} = "yes"; + } + else + { + delete $last_chance{$id}; + } + if ($split eq "" && $jump_rep_because_stuck ne "") + { + delete $jump_rep_because_stuck{$id}; + $count_stuck_in_rep++; + } + $id_test = $id; + if ($split eq "") + { + foreach my $add_read2 (keys %extensions) + { + my $add_read = substr $extensions{$add_read2}, 0, -1; + $count_reads{$add_read} = undef; + } + } + } + elsif ($use_regex ne "yes" && $indel_split > 0) + { + delete $regex{$id}; + if ($split_forward eq "") + { + $regex{$id} = "yes"; + } + elsif ($split_forward ne "" && exists($SNP_active{$id})) + { + $indel_split_skip{$id} = "yes"; + $indel_split = '0'; + delete $indel_split{$id}; + } + elsif ($split_forward ne "") + { + $SNP_active{$id} = "yes"; + } + $read_new = $read; + if ($y > $startprint2) + { + print OUTPUT5 "OPTION1\n"; + } + } + elsif ($use_regex ne "" 
&& $indel_split > 0) + { + $indel_split_skip{$id} = "yes"; + $indel_split = '0'; + delete $indel_split{$id}; + if ($y > $startprint2) + { + print OUTPUT5 "OPTION1b\n"; + } + } + elsif ($use_regex ne "yes" && $repetitive_detect eq "") + { + delete $regex{$id}; + $regex{$id} = "yes"; + $read_new = $read; + if ($y > $startprint2) + { + print OUTPUT5 "OPTION2\n"; + } + } + elsif ($use_regex ne "yes" && $last_chance ne "yes" && $indel_split > 0) + { + $read_new = $read; + $indel_split_skip{$id} = "yes"; + $indel_split = '0'; + delete $indel_split{$id}; + $SNP_active{$id} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "OPTION3\n"; + } + } + elsif (($use_regex ne "" || $repetitive_detect ne "") && $last_chance ne "yes" && ($AT_rich_before eq "" || $mmr < 15)) + { + $read_new = $read; + delete $last_chance{$id}; + $last_chance{$id} = "yes"; + delete $regex{$id}; + $use_regex = ""; + if ($y > $startprint2) + { + print OUTPUT5 "OPTION4\n"; + } + } + elsif ($last_chance eq "yes" && $use_regex ne "yes" && $repetitive_detect eq "") + { + $read_new = $read; + delete $last_chance{$id}; + $last_chance{$id} = "yes"; + delete $regex{$id}; + $regex{$id} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "OPTION5\n"; + } + } + elsif (($last_chance eq "yes" || $AT_rich_before ne "") && $use_regex ne "") + { + $read_new = $read; + delete $last_chance{$id}; + $noforward = "stop"; + $noforward{$id} = $noforward; + if ($y > $startprint2) + { + print OUTPUT5 "OPTION6\n"; + } + } + else + { + $noforward = "stop"; + $noforward{$id} = $noforward; + if ($y > $startprint2) + { + print OUTPUT5 "OPTION7\n"; + } + } + if ($split eq "yes2") + { + $seed{$id} = $read_new; + $insert_size2{$id} = $insert_size; + + if ($count_split eq '2') + { + $split = "yes4"; + } + goto SPLIT; + } + elsif ($split eq "yes3") + { + $seed{$id} = $read_new; + $insert_size2{$id} = $insert_size; + + if ($count_split eq '3') + { + $split = "yes4"; + } + goto SPLIT; + } + if ($split eq "yes4") + { + $seed{$id} = 
$read_new; + $insert_size2{$id} = $insert_size; + + goto SPLIT; + } +BACK: if ($y > $startprint2 && $benchmark_time eq "yes") + { + $time_back = time; + if ($time_back - $time_collect_ext > 1) + { + print OUTPUT5 $time_back - $time_collect_ext." TIME_BACK\n"; + } + } + if ($ref_skip_before eq "yes") + { + if (exists($last_ref_seq_forward{$id}) && $split eq "") + { + my $seq_tmp = $last_ref_seq_forward{$id}; + $last_ref_seq_forward{$id} = $seq_tmp.$best_extension; + } + if (exists($large_variance_forward{$id}) && $split eq "") + { + $large_variance_length_forward{$id} = $large_variance_length_forward{$id}+length($best_extension); + } + } + if (keys %merged_match_back eq 0 && $use_regex_back ne "yes" && $noback eq "" && $circle eq "" ) + { + delete $regex_back{$id}; + $regex_back{$id} = "yes"; + $best_extension_back_prev{$id} = ""; + } + elsif (keys %merged_match_back eq 0 && $use_regex_back eq "yes") + { + $noback{$id} = "stop"; + $best_extension_back_prev{$id} = ""; + } + if (keys %merged_match_back > 0 && $noback ne "stop" ) + { + undef %extensions1; + undef %extensions2; + undef %extensions; + undef %extensions1b; + undef %extensions2b; + undef %extensionsb; + undef @extensions1; + undef @extensions2; + undef @extensions; + undef %extensions1b; + undef %extensions2b; + undef %extensionsb; + undef @matches; + undef @matches1; + undef @matches2; + undef %SNR_length; + undef %filter_before1; + undef %filter_before2; + undef %filter_before3; + undef %filter_before4; + undef %filter_before1_pair; + undef %filter_before2_pair; + undef %filter_before3_pair; + undef %filter_before4_pair; + undef %remove_extension_mismatch; + $split_forward = ""; + $split = ""; + $extensions_before = ""; + $ext_before = ""; + $insert_range_shorter = ""; + $best_extension_old1 = ""; + $best_extension_old2 = ""; + $best_extension_old3 = ""; + $best_extension_old4 = ""; + +REGEX_BACK: + $read_count = '0'; + $read_ex = '0'; + $read2_ex = '0'; + + if ($SNR_read_back ne "") + { + } + my $X4 = 
$read_short_start =~ tr/\*/\*/; + + if ($X4 > 0) + { + %read_short_start_tmp = build_partial3c ($read_short_start, "reverse_back"); + } + else + { + $read_short_start_tmp{$read_short_start} = undef; + } + + if ($y > $startprint2) + { + if ($use_regex_back eq "yes") + { + print OUTPUT5 "USE_REGEX_BACK_REVERSE\n"; + } + } + + my $extra_overlap = ""; + my $read_short_start_tempie = ""; + my $test_dot = '0'; + my $test_star = '0'; + my %hash_read_short_start; + my %hash_read_short_start_dot; + my $F; + if ($last_chance_back ne "yes") + { + $extra_overlap = sprintf("%.0f", (($read_length-$overlap)/3)); + + $F = ($insert_size-(($insert_size*$insert_range_back)-$insert_size))-$read_length+($read_length-$overlap-$extra_overlap-15)-($read_length-$left-$right-$overlap); + if ($F < 0) + { + $F = '0'; + } + $read_short_start_tempie = substr $read, $F , ($insert_size*$insert_range_back)-$F; + $read_short_start_tempie =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + undef %read_short_start_tmp; + $test_dot = $read_short_start_tempie =~ tr/\./\./; + $test_star = $read_short_start_tempie =~ tr/\*/\*/; + + if ($test_dot > 3 || $mmbr < 20) + { + %read_short_start_tmp = build_partial3c $read_short_start_tempie; + } + else + { + %read_short_start_tmp = build_partial3b $read_short_start_tempie; + } + if ($mmbr > 19) + { + my $ff = '0'; + undef %hash_read_short_start; + foreach my $read_short_start_tempie (keys %read_short_start_tmp) + { + $ff = '0'; + if (length($read) < $insert_size*$insert_range && $hp_seed_assemble ne "") + { + $extra_overlap = '0'; + } + while ($ff < (length($read_short_start_tempie)-($overlap+$extra_overlap)-1)) + { + my $read_short_start_part = substr $read_short_start_tempie, $ff, $overlap+$extra_overlap; + if ($test_dot < 6 && $test_dot > 3) + { + my $test_dot2 = $read_short_start_part =~ tr/\./\./; + if ($test_dot2 > 0) + { + $hash_read_short_start_dot{$read_short_start_part} = $ff; + } + else + { + $hash_read_short_start{$read_short_start_part} = $ff; + } + } + else + { + 
$hash_read_short_start{$read_short_start_part} = $ff; + } + $ff++; + } + } + } + } +NO_MATCH_BACK: foreach my $ln (keys %merged_match_back) + { + $match = $merged_match_back{$ln}; + $id_match = $ln; + chomp $id_match; + chomp $match; + + if ($hp_seed_assemble ne "" && $last_chance_back eq "yes") + { + my $check = ""; + foreach my $pos_tmp (keys %SNPs) + { + if ($pos_tmp >= -$position_back && $pos_tmp < (-$position_back+length($match))) + { + $check = "yes"; + } + } + if ($check ne "yes") + { + next NO_MATCH_BACK; + } + } + if ($last_chance_back eq "yes") + { + my $forward = ""; + if (exists($merged_match_back1{$ln})) + { + my $test = substr $match, -$merged_match_back_pos{$ln}-$right-$overlap, $overlap; + + my $last_nuc = substr $match, -$merged_match_back_pos{$ln}-$right-$overlap, 1; + if ($heteroplasmy ne "" && ($last_nuc eq "1" || $last_nuc eq "2" || $last_nuc eq "3" || $last_nuc eq "4" || $last_nuc eq "N") && $SNR_read_back eq "") + { + next NO_MATCH_BACK; + } + if ($use_quality ne "") + { + $test =~ tr/1234/ACTG/; + my $countN = $match =~ tr/1234//; + if ($countN > length($match)*0.35) + { + next NO_MATCH_BACK; + } + } + + foreach my $read_start_tmp (keys %read_start) + { + if ($test eq $read_start_tmp) + { + my $extension_tmp = $match; + substr $extension_tmp, -$merged_match_back_pos{$ln}-$right-$overlap, $merged_match_back_pos{$ln}+$right+$overlap, ""; + $extension = reverse($extension_tmp); + $forward = "yes"; + + $read_count++; + goto LAST1_BACK; + } + } + my $test_N1 = $read_start =~ tr/N/\./; + my $test_N2 = $test =~ tr/N/\./; + + if ($test_N1 > 0 && $test_N2 eq '0' && $test_N1 < $overlap/6) + { + foreach my $read_start_tmp (keys %read_start) + { + if ($test =~ m/$read_start_tmp/) + { + my $extension_tmp = $match; + substr $extension_tmp, -$merged_match_back_pos{$ln}-$right-$overlap, $merged_match_back_pos{$ln}+$right+$overlap, ""; + $extension = reverse($extension_tmp); + $forward = "yes"; + + $read_count++; + goto LAST1_BACK; + } + } + } + elsif 
($test_N2 > 0 && $test_N1 eq '0' && $test_N2 < $overlap/6) + { + foreach my $read_start_tmp (keys %read_start) + { + if ($read_start_tmp =~ m/$test/) + { + my $extension_tmp = $match; + substr $extension_tmp, -$merged_match_back_pos{$ln}-$right-$overlap, $merged_match_back_pos{$ln}+$right+$overlap, ""; + $extension = reverse($extension_tmp); + $forward = "yes"; + + $read_count++; + goto LAST1_BACK; + } + } + } + foreach my $read_start_tmp (keys %read_start) + { + my @test_tmp = split //, $test; + my @read_start_tmp = split //, $read_start_tmp; + + my $d = '0'; + my $next = ""; + my $N = '0'; + + while ($d < length($read_start_tmp)) + { + if ($test_tmp[$d] eq $read_start_tmp[$d]) + { + } + elsif ($read_start_tmp[$d] eq ".") + { + } + elsif ($test_tmp[$d] eq "." && $N < '5') + { + $N++; + } + elsif ($next eq "" && $hp_seed_assemble eq "" && length($read) > $insert_size+200) + { + $next = "yes"; + } + elsif ($next eq "yes") + { + $next = "yes2"; + } + elsif ($next eq "yes2") + { + $next = "yes3"; + } + else + { + next NO_MATCH_BACK; + } + $d++ + } + + my $extension_tmp = $match; + substr $extension_tmp, -$merged_match_back_pos{$ln}-$right-$overlap, $merged_match_back_pos{$ln}+$right+$overlap, ""; + $extension = reverse($extension_tmp); + $forward = "yes"; + + $read_count++; + goto LAST1_BACK; + } + } + elsif (exists($merged_match_back2{$ln})) + { + my $match_reverse = reverse($match); + $match_reverse =~ tr/ACTG/TGAC/; + my $test = substr $match_reverse, -$merged_match_back_pos{$ln}-$left-$overlap, $overlap; + + if ($use_quality ne "") + { + $test =~ tr/1234/TGAC/; + my $countN = $match =~ tr/1234//; + if ($countN > length($match)*0.35) + { + next NO_MATCH_BACK; + } + } + + foreach my $read_start_tmp (keys %read_start) + { + if ($test eq $read_start_tmp) + { + my $extension_tmp = $match_reverse; + substr $extension_tmp, -$merged_match_back_pos{$ln}-$left-$overlap, $merged_match_back_pos{$ln}+$left+$overlap, ""; + $extension = reverse($extension_tmp); + + 
$read_count++; + goto LAST1_BACK; + } + } + my $test_N1 = $read_start =~ tr/N/\./; + my $test_N2 = $test =~ tr/N/\./; + + if ($test_N1 > 0 && $test_N2 eq '0' && $test_N1 < $overlap/6) + { + foreach my $read_start_tmp (keys %read_start) + { + if ($test =~ m/$read_start_tmp/) + { + my $extension_tmp = $match_reverse; + substr $extension_tmp, -$merged_match_back_pos{$ln}-$left-$overlap, $merged_match_back_pos{$ln}+$left+$overlap, ""; + $extension = reverse($extension_tmp); + + $read_count++; + goto LAST1_BACK; + } + } + } + elsif ($test_N2 > 0 && $test_N1 eq '0' && $test_N2 < $overlap/6) + { + foreach my $read_start_tmp (keys %read_start) + { + if ($test =~ m/$read_start_tmp/) + { + my $extension_tmp = $match_reverse; + substr $extension_tmp, -$merged_match_back_pos{$ln}-$left-$overlap, $merged_match_back_pos{$ln}+$left+$overlap, ""; + $extension = reverse($extension_tmp); + + $read_count++; + goto LAST1_BACK; + } + } + } + foreach my $read_start_tmp (keys %read_start) + { + my @test_tmp = split //, $test; + my @read_start_tmp = split //, $read_start_tmp; + + my $d = '0'; + my $next = ""; + my $N = '0'; + + while ($d < length($read_start_tmp)) + { + if ($test_tmp[$d] eq $read_start_tmp[$d]) + { + } + elsif ($read_start_tmp[$d] eq ".") + { + } + elsif ($test_tmp[$d] eq "." 
&& $N < '5') + { + $N++; + } + elsif ($next eq "" && $hp_seed_assemble eq "" && length($read) > $insert_size+200) + { + $next = "yes"; + } + elsif ($next eq "yes") + { + $next = "yes2"; + } + elsif ($next eq "yes2") + { + $next = "yes3"; + } + elsif ($containX_short_start2 > 0) + { + goto STAR_BACK; + } + else + { + next NO_MATCH_BACK; + } + $d++ + } + + my $extension_tmp = $match_reverse; + substr $extension_tmp, -$merged_match_back_pos{$ln}-$left-$overlap, $merged_match_back_pos{$ln}+$left+$overlap, ""; + $extension = reverse($extension_tmp); + + $read_count++; + goto LAST1_BACK; + } + } + +STAR_BACK: if ($containX_short_start2 > 0) + { + if (exists($merged_match_back2{$ln})) + { + foreach my $line (keys %read_start_b) + { + my @read_start_b_sub; + if ($use_regex_back eq "yes") + { + @read_start_b_sub = build_partialb $line; + } + else + { + push @read_start_b_sub, $line; + } + + my $found_seq = '0'; + my $match2 = $match; + + foreach my $read_start_b_sub (@read_start_b_sub) + { + $found_seq = $match2 =~ s/.$read_start_b_sub/+/; + if ($found_seq > 0) + { + my @ext = split /\+/, $match2; + my $extension5 = $ext[1]; + $extension5 =~ tr/ATCG/TAGC/; + $extension = $extension5; + $read_count++; + goto LAST1_BACK; + } + } + } + } + elsif (exists($merged_match_back1{$ln})) + { + foreach my $line (keys %read_start) + { + my @read_start_sub; + if ($use_regex_back eq "yes") + { + @read_start_sub = build_partialb $line; + } + else + { + push @read_start_sub, $line; + } + my $found_seq = '0'; + my $match4 = $match; + + foreach my $read_start_sub (@read_start_sub) + { + $found_seq = $match4 =~ s/(.)$read_start_sub/$1+/g; + if ($found_seq > 1) + { + my $pos = $merged_match_back_pos{$ln}; + my $match4b = substr $match, 0, -$pos; + $match4b =~ s/(.+)$line/$1+/; + my @ext = split /\+/, $match4; + $extension = reverse($ext[0]); + $read_count++; + $forward = "yes"; + goto LAST1_BACK; + } + elsif ($found_seq > 0) + { + my @ext = split /\+/, $match4; + $extension = reverse($ext[0]); 
+ $read_count++; + $forward = "yes"; + goto LAST1_BACK; + } + } + } + } + } + next NO_MATCH_BACK; + +LAST1_BACK: my $id_match_end = substr $id_match, -1, 1; + my $id_match_tmp = substr $id_match, 0, -1,; + + if ($hp_seed_assemble ne "") + { + my $check = ""; + my $test; + foreach my $pos_tmp (keys %SNPs) + { + if ($pos_tmp >= -$position_back && $pos_tmp < (-$position_back+(length($match)-length($extension)))) + { + if (exists($merged_match_back1{$ln})) + { + my $hp_SNP_read = substr $match, length($extension)+$position_back+$pos_tmp, 1; + if ($pos_tmp < -100000000000000) + { + $test = $match; + print OUTPUT5 $test." WWW00\n"; + print OUTPUT5 $hp_SNP_read." WWW001\n"; + print OUTPUT5 $SNPs{$pos_tmp}." WWW002\n"; + } + if ($hp_SNP_read eq $SNPs{$pos_tmp}) + { + $check = "yes"; + } + else + { + delete $accepted_SNPs_back{$ln}; + next NO_MATCH_BACK; + } + } + elsif (exists($merged_match_back2{$ln})) + { + my $match_tmp = reverse($match); + $match_tmp =~ tr/ACTG/TGAC/; + my $hp_SNP_read = substr $match_tmp, length($extension)+$position_back+$pos_tmp, 1; + if ($pos_tmp < -100000000000000) + { + $test = $match_tmp; + print OUTPUT5 $test." WWW01\n"; + print OUTPUT5 $hp_SNP_read." WWW011\n"; + print OUTPUT5 $SNPs{$pos_tmp}." 
WWW012\n"; + } + if ($hp_SNP_read eq $SNPs{$pos_tmp}) + { + $check = "yes"; + } + else + { + delete $accepted_SNPs_back{$ln}; + next NO_MATCH_BACK; + } + } + } + } + if (exists($accepted_SNPs_back{$ln})) + { + $check = "yes"; + } + if ($check ne "yes") + { + next NO_MATCH_BACK; + } + } + if (index ($id_match_tmp, $id) eq "-1" && $extension ne "NOOO") + { + my $nuc_exlude = "test"; + if ($use_quality ne "" && $SNR_read2 eq "") + { + $extension =~ tr/1|2|3|4/N/; + } + elsif ($use_quality ne "") + { + $extension =~ tr/1234/ACTG/; + } + if ($yuyu_option_A_back eq "A" || $yuyu_option_C_back eq "C" || $yuyu_option_T_back eq "T" || $yuyu_option_G_back eq "G") + { + $nuc_exlude = substr $extension, 0, 1; + } + if ($extension ne "NOOO" && $extension ne " " && $extension ne "" && $nuc_exlude ne $yuyu_option_A_back && $nuc_exlude ne $yuyu_option_C_back && $nuc_exlude ne $yuyu_option_T_back && $nuc_exlude ne $yuyu_option_G_back) + { + push @matches2, $id_match.",".$extension.","."".",".$match.",".""; + $extensions2{$extension} = $id_match; + push @extensions2, $extension; + if ($forward eq "yes") + { + $extensions2b{$id_match} = $extension; + } + else + { + $extensions1b{$id_match} = $extension; + } + if ($save_reads ne "") + { + my $add_read = substr $id_match, 0, -1; + if (exists($save_reads{$add_read})) + { + } + else + { + $save_reads{$add_read} = undef; + if ($save_reads eq "2") + { + my $add_read2 = $map_ids{$add_read}; + print OUTPUT10 $add_read2."\/1\n"; + print OUTPUT11 $add_read2."\/2\n"; + } + else + { + print OUTPUT10 ">".$add_read."\/1\n"; + print OUTPUT11 ">".$add_read."\/2\n"; + } + if (exists($hash{$add_read})) + { + my @add_read = split /,/,$hash{$add_read}; + my $forward = $add_read[0]; + my $reverse = $add_read[1]; + if ($use_quality ne "") + { + $forward =~tr/1234/ACTG/; + $reverse =~tr/1234/ACTG/; + } + print OUTPUT10 $forward."\n"; + print OUTPUT11 $reverse."\n"; + } + } + } + } + } + next NO_MATCH_BACK; + } + else + { + my $test = substr $match, 
-$merged_match_back_pos{$ln}-$right-$overlap, $overlap; + + my $last_nuc = substr $match, -$merged_match_back_pos{$ln}-$right-$overlap, 1; + if ($heteroplasmy ne "" && ($last_nuc eq "1" || $last_nuc eq "2" || $last_nuc eq "3" || $last_nuc eq "4" || $last_nuc eq "N") && $SNR_read_back eq "") + { + next NO_MATCH_BACK; + } + if ($use_quality ne "") + { + $test =~ tr/1234/ACTG/; + my $countN = $match =~ tr/1234//; + if ($countN > length($match)*0.35) + { + next NO_MATCH_BACK; + } + } + + if ($test eq $read_start) + { + my $extension_tmp = $match; + substr $extension_tmp, -$merged_match_back_pos{$ln}-$right-$overlap, $merged_match_back_pos{$ln}+$right+$overlap, ""; + $extension = reverse($extension_tmp); + + $read_count++; + goto FOUND_BACK; + } + my $test_N1 = $read_start =~ tr/N/\./; + my $test_N2 = $test =~ tr/N/\./; + + if ($test_N1 > 0 && $test_N2 eq '0' && $test_N1 < $overlap/6) + { + if ($test =~ m/$read_start/) + { + my $extension_tmp = $match; + substr $extension_tmp, -$merged_match_back_pos{$ln}-$right-$overlap, $merged_match_back_pos{$ln}+$right+$overlap, ""; + $extension = reverse($extension_tmp); + + $read_count++; + goto FOUND_BACK; + } + } + elsif ($test_N2 > 0 && $test_N1 eq '0' && $test_N2 < $overlap/6) + { + if ($read_start =~ m/$test/) + { + my $extension_tmp = $match; + substr $extension_tmp, -$merged_match_back_pos{$ln}-$right-$overlap, $merged_match_back_pos{$ln}+$right+$overlap, ""; + $extension = reverse($extension_tmp); + + $read_count++; + goto FOUND_BACK; + } + } + + my @test_tmp = split //, $test; + my @read_start_tmp = split //, $read_start; + + my $d = '0'; + my $next = ""; + my $N = '0'; + + while ($d < length($read_start)) + { + if ($test_tmp[$d] eq $read_start_tmp[$d]) + { + } + elsif ($read_start_tmp[$d] eq ".") + { + } + elsif ($test_tmp[$d] eq "." 
&& $N < '5') + { + $N++; + } + elsif ($next eq "" && $hp_seed_assemble eq "" && $use_regex_back eq "yes") + { + $next = "yes"; + } + elsif ($next eq "yes") + { + $next = "yes2"; + } + elsif ($next eq "yes2") + { + $next = "yes3"; + } + elsif ($containX_short_start2 > 0) + { + goto STAR_BACK2; + } + else + { + next NO_MATCH_BACK; + } + $d++ + } + + my $extension_tmp = $match; + substr $extension_tmp, -$merged_match_back_pos{$ln}-$right-$overlap, $merged_match_back_pos{$ln}+$right+$overlap, ""; + $extension = reverse($extension_tmp); + + $read_count++; + goto FOUND_BACK; + } + +STAR_BACK2: if ($containX_short_start2 > 0) + { + foreach my $line (keys %read_start) + { + my @read_start_sub; + if ($hp_seed_assemble eq "" && $use_regex_back ne "") + { + @read_start_sub = build_partialb $line; + } + else + { + push @read_start_sub, $line; + } + my $found_seq = '0'; + my $match4 = $match; + + foreach my $read_start_sub (@read_start_sub) + { + $found_seq = $match4 =~ s/(.)$read_start_sub/$1+/g; + if ($found_seq > 1) + { + my $pos = $merged_match_back_pos{$ln}; + my $match4b = substr $match, 0, -$pos; + $match4b =~ s/(.+)$line/$1+/; + my @ext = split /\+/, $match4; + $extension = reverse($ext[0]); + $read_count++; + goto FOUND_BACK; + } + elsif ($found_seq > 0) + { + my @ext = split /\+/, $match4; + $extension = reverse($ext[0]); + $read_count++; + goto FOUND_BACK; + } + } + } + } + next NO_MATCH_BACK; + +FOUND_BACK: if ($last_chance_back eq "yes") + { + next NO_MATCH_BACK; + } + if ($hp_seed_assemble ne "" && $last_chance_back ne "yes") + { + my $check = ""; + foreach my $pos_tmp (keys %SNPs) + { + if ($pos_tmp >= -$position_back && $pos_tmp < (-$position_back+(length($match)-length($extension)))) + { + if (exists($merged_match_back1{$ln})) + { + my $hp_SNP_read = substr $match, length($extension)+$position_back+$pos_tmp-1, 1; + if ($hp_SNP_read eq $SNPs{$pos_tmp}) + { + $check = "yes"; + } + else + { + delete $accepted_SNPs_back{$ln}; + next NO_MATCH_BACK; + } + } + } + } + 
if ($check eq "yes") + { + $accepted_SNPs_back{$ln} = undef; + } + } + if ($extension ne "NOOO") + { + my $id_match_b = $id_match; + my $id_match_end = substr $id_match_b, -1, 1,"",; + + if (exists($hash{$id_match_b})) + { + my @id_match_b = split /,/, $hash{$id_match_b}; + + if ($id_match_end eq "1") + { + $match_pair = $id_match_b[1]; + } + elsif ($id_match_end eq "2") + { + $match_pair = $id_match_b[0]; + } + else + { + next NO_MATCH_BACK; + } + chomp($match_pair); + if ($encrypt eq "yes") + { + $match_pair = decrypt $match_pair; + } + + my $match_pair3 = $match_pair; + $match_pair3 =~ tr/ATCG/TAGC/; + $match_pair2 = reverse ($match_pair3); + + my $match_pair_middle2 = substr $match_pair2, $right+5, $overlap+$extra_overlap; + if (length($read) < $insert_size*$insert_range && $hp_seed_assemble ne "") + { + $match_pair_middle2 = substr $match_pair2, $right, $overlap; + } + if ($use_quality ne "") + { + $match_pair_middle2 =~ tr/1234/TGAC/; + } + my $countN = $match_pair_middle2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $extra_regex_tmp = ""; + if ($countN > 0 && $countN < length($match_pair_middle2)/5) + { + $extra_regex_tmp = "yes"; + } + elsif ($countN >= length($match_pair_middle2)/5) + { + next NO_MATCH_BACK; + } + + if ($test_dot > 5 || $extra_regex_tmp eq "yes") + { + if (exists($hash_read_short_start{$match_pair_middle2})) + { + if ($hp_seed_assemble ne "") + { + check_HP_pos_back($hash_read_short_start{$match_pair_middle2}, $F, $position_back, $match_pair2, $ln); + } + $counttest1++; + $extension_match = ""; + goto SKIP_BACK; + } + my @match_pair_middle_sub = split //, $match_pair_middle2; + $extension_match = ""; + my $gh = '0'; + foreach my $line (keys %read_short_start_tmp) + { + my @line = split //,$line; + $gh = '0'; + $line =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + +CHECK_PAIR_BACK: while ($gh < length($line)-length($match_pair_middle2)) + { + my $d = '0'; + my $next = ""; + my $th = '0'; + my $N = '0'; + if ($use_regex ne "yes") + { + $next = "yes3"; + } + + 
while ($d < length($match_pair_middle2)) + { + $th = $d + $gh; + if ($match_pair_middle_sub[$d] eq $line[$th]) + { + } + elsif ($line[$th] eq ".") + { + } + elsif ($match_pair_middle_sub[$d] eq ".") + { + $N++; + } + elsif ($next eq "" && $hp_seed_assemble eq "") + { + $next = "yes"; + } + elsif ($next eq "yes") + { + $next = "yes2"; + } + elsif ($next eq "yes2") + { + $next = "yes3"; + } + else + { + $gh++; + goto CHECK_PAIR_BACK; + } + $d++ + } + goto CHECK_PAIR0_BACK; + } + } + $extension_match = "NOOO"; + +CHECK_PAIR0_BACK: $counttest1++; + + if ($hp_seed_assemble ne "") + { + check_HP_pos_back($gh, $F, $position_back, $match_pair2, $ln); + } + goto SKIP_BACK; + } + else + { + my $found_seq = '0'; + my $line_tmpb; + if (exists($hash_read_short_start{$match_pair_middle2})) + { + $extension_match = ""; + if ($hp_seed_assemble ne "") + { + check_HP_pos_back($hash_read_short_start{$match_pair_middle2}, $F, $position_back, $match_pair2, $ln); + } + goto SKIP_BACK; + } + if ($test_dot > 3 && $mmbr > 19) + { + my $match_pair_middle_tmp = $match_pair_middle2; +CHECK_PAIR2_BACK: foreach my $line (keys %hash_read_short_start_dot) + { + my $found_seq = '0'; + my $star_first = substr $line, 0, 1; + if ($star_first eq "*") + { + $line = substr $line, 1; + $match_pair_middle_tmp = substr $match_pair_middle_tmp, 1; + } + $found_seq = $match_pair_middle_tmp =~ s/$line/+/; + $line_tmpb = $read_short_end; + if ($found_seq > 0) + { + last CHECK_PAIR2_BACK; + } + } + if ($found_seq > 0) + { + $extension_match = ""; + if ($hp_seed_assemble ne "") + { + check_HP_pos_back($hash_read_short_start_dot{$match_pair_middle2}, $F, $position_back, $match_pair2, $ln); + } + goto SKIP_BACK; + } + else + { + $extension_match = "NOOO"; + } + } + elsif ($test_dot > 0 || $test_star > '0' || $mmbr < 19) + { + foreach my $line (keys %read_short_start_tmp) + { + $found_seq = '0'; + $found_seq = $line =~ s/$match_pair_middle2/+/; + if ($found_seq > 0) + { + $extension_match = ""; + if 
($hp_seed_assemble ne "") + { + my @split = split /\+/, $line; + my $th = length($split[0]); + check_HP_pos_back($th, $F, $position_back, $match_pair2, $ln); + } + goto SKIP_BACK; + } + } + } + $extension_match = "NOOO"; + } +SKIP_BACK: + if ($hp_seed_assemble ne "") + { + my $check = ""; + foreach my $pos_tmp (keys %SNPs) + { + if ($pos_tmp >= -$position_back && $pos_tmp < (-$position_back+$overlap)) + { + $check = "yes"; + } + } + if (exists($accepted_SNPs_back{$ln})) + { + $check = "yes"; + } + if (exists($accepted_SNPs_pair_back{$ln})) + { + $check = "yes"; + } + if ($check ne "yes") + { + $extension_match = "NOOO"; + } + } + my $nuc_exlude = "test"; + if ($yuyu_option_A_back eq "A" || $yuyu_option_C_back eq "C" || $yuyu_option_T_back eq "T" || $yuyu_option_G_back eq "G") + { + $nuc_exlude = substr $extension, 0, 1; + } + if ($extension_match ne "NOOO" && $extension ne " " && $extension ne "" && $nuc_exlude ne $yuyu_option_A_back && $nuc_exlude ne $yuyu_option_C_back && $nuc_exlude ne $yuyu_option_T_back && $nuc_exlude ne $yuyu_option_G_back) + { + $read_ex++; + push @matches1, $id_match.",".$extension.","."".",".$match.",".$match_pair; + + if ($extension ne " " && $extension ne "") + { + if ($use_quality ne "" && $SNR_read2 eq "") + { + $extension =~ tr/1|2|3|4/N/; + } + elsif ($use_quality ne "") + { + $extension =~ tr/1234/ACTG/; + } + $extensions2{$extension} = $id_match; + $extensions2b{$id_match} = $extension; + push @extensions2, $extension; + if ($save_reads ne "") + { + my $add_read = substr $id_match, 0, -1; + if (exists($save_reads{$add_read})) + { + } + else + { + $save_reads{$add_read} = undef; + if ($save_reads eq "2") + { + my $add_read2 = $map_ids{$add_read}; + print OUTPUT10 $add_read2."\/1\n"; + print OUTPUT11 $add_read2."\/2\n"; + } + else + { + print OUTPUT10 ">".$add_read."\/1\n"; + print OUTPUT11 ">".$add_read."\/2\n"; + } + if (exists($hash{$add_read})) + { + my @add_read = split /,/,$hash{$add_read}; + my $forward = $add_read[0]; + my 
$reverse = $add_read[1]; + if ($use_quality ne "") + { + $forward =~tr/1234/ACTG/; + $reverse =~tr/1234/ACTG/; + } + print OUTPUT10 $forward."\n"; + print OUTPUT11 $reverse."\n"; + } + } + } + } + } + + } + } + } + %extensions = (%extensions1, %extensions2); + %extensionsb = (%extensions1b, %extensions2b); + @extensions = (@extensions1, @extensions2); + @matches = (@matches1, @matches2); + + my $ext = '0'; + my $ext_total_back = '0'; + foreach (@extensions) + { + $ext++; + } + $ext_total_back = $ext; + + if ($y > $startprint2) + { + print OUTPUT5 "\n".$read_count ." READ_COUNT_BACK\n"; + print OUTPUT5 $read_ex ." READ_EX_BACK\n"; + print OUTPUT5 $ext ." EXTENSIONS_BACK\n"; + } + + undef %SNR_length; + + if ($y > $startprint && $print_log eq '2') + { + foreach my $matches (@matches) + { + my @matchesb; + undef @matchesb; + @matchesb = split /,/, $matches; + + print OUTPUT5 $matchesb[3]." MATCH\n"; + print OUTPUT5 $matchesb[4]." MATCH_PAIR\n"; + } + } + my $id_original = $id; + + my $l = '0'; + my $best_extension = ""; + my $SNP = ""; + my $A_SNP = '0'; + my $C_SNP = '0'; + my $T_SNP = '0'; + my $G_SNP = '0'; + my $position_SNP = $position_back; + my $pos_SNP = '0'; + + my $A_SNP2 = '0'; + my $C_SNP2 = '0'; + my $T_SNP2 = '0'; + my $G_SNP2 = '0'; + my $position_SNP2 = $position_back; + my $pos_SNP2 = '0'; + + my $A_SNP3 = '0'; + my $C_SNP3 = '0'; + my $T_SNP3 = '0'; + my $G_SNP3 = '0'; + my $position_SNP3 = $position_back; + my $pos_SNP3 = '0'; + + if ($SNR_read_back ne "" && $SNR_read_back2 ne "" && $last_chance_back ne "yes" && $ext < 5 && $indel_split_back eq '0') + { + goto AFTER_EXT_BACK; + } + undef @extensions_group1; + undef @extensions_group2; + undef %extensions_group1; + undef %extensions_group2; + undef @extensions_group3; + undef @extensions_group4; + undef %extensions_group3; + undef %extensions_group4; + + $no_SNR = ""; + undef %extensions_backup; + undef @extensions_backup; + my %extensions_backup = %extensions; + my @extensions_backup = @extensions; 
+ +SPLIT_BACK: + if ($split eq "yes_back") + { + %extensions = %extensions_group2; + @extensions = @extensions_group2; + $split = "yes2_back"; + $ext = '0'; + foreach (@extensions) + { + $ext++; + } + } + elsif ($split eq "yes2_back") + { + @extensions = @extensions_group3; + %extensions = %extensions_group3; + $split = "yes3_back"; + $ext = '0'; + foreach (@extensions) + { + $ext++; + } + } + elsif ($split eq "yes3_back") + { + @extensions = @extensions_group4; + %extensions = %extensions_group4; + $split = "yes4_back"; + $ext = '0'; + foreach (@extensions) + { + $ext++; + } + } + elsif ($split eq "yes4_back") + { + @extensions = @extensions_group1; + %extensions = %extensions_group1; + $split = "yes5_back"; + $ext = '0'; + foreach (@extensions) + { + $ext++; + } + if ($count_split eq '1') + { + $split = ""; + } + } + $id = $id_original; + $position_back = $position_back{$id}; + $l = '0'; + $best_extension = ""; + $SNP = ""; + $A_SNP = '0'; + $C_SNP = '0'; + $T_SNP = '0'; + $G_SNP = '0'; + $position_SNP = $position_back; + $pos_SNP = '0'; + + $A_SNP2 = '0'; + $C_SNP2 = '0'; + $T_SNP2 = '0'; + $G_SNP2 = '0'; + $position_SNP2 = $position_back; + $pos_SNP2 = '0'; + + $A_SNP3 = '0'; + $C_SNP3 = '0'; + $T_SNP3 = '0'; + $G_SNP3 = '0'; + $position_SNP3 = $position_back; + $pos_SNP3 = '0'; + my %SNR_count_back; + my %extensions_new; + my @extensions_new; + undef %SNR_count_back; + undef %extensions_new; + undef @extensions_new; + my $SNR_test = ""; + my $most_SNR = '0'; + $most_SNR2 = '0'; + + if ($SNR_read_back ne "" && $split eq "" && $SNR_read_back2 ne "" && $use_quality eq "") + { + $SNR_test = "yes2_back"; + if ($SNR_read_back eq "yes") + { + $SNR_test = "yes2_back"; + if ($y > $startprint2) + { + print OUTPUT5 $SNR_test." 
SNR_TEST_BACK\n"; + } + my $G = '0'; + my $G2 = '0'; + my $no_SNR1 = ""; + my $second_round; +SNR1_BACK: foreach my $extensions (@extensions) + { + my @chars = split//, $extensions; + my $e = '0'; + my $check_only_SNR = $extensions =~ s/$SNR_nucleo_back/$SNR_nucleo_back/g; + if (length($extensions) eq $check_only_SNR) + { + $G2++; + } + $G++; + if ($second_round eq "") + { + $no_SNR1 = "yes"; + } + while ($SNR_nucleo_back eq $chars[$e] || $no_SNR1 eq "") + { + if ($SNR_nucleo_back ne $chars[$e]) + { + $no_SNR1 = "yes"; + } + $e++; + } + if ($e < length($extensions)) + { + $SNR_count_back{$extensions} = $e; + $SNR_length{$e} .= exists $SNR_length{$e} ? ",$extensions" : $extensions; + } + $no_SNR1 = ""; + } + my $SNR_length_count2 = '0'; + my $SNR_length_reads = ""; + my $first = ""; + foreach my $SNR_length (keys %SNR_length) + { + my $SNR_length_count = $SNR_length{$SNR_length} =~ tr/,/,/; + if ($SNR_length_count > $SNR_length_count2) + { + $SNR_length_count2 = $SNR_length_count; + $SNR_length_reads = $SNR_length{$SNR_length}; + $most_SNR = $SNR_length; + $first = $SNR_length_count; + } + } + my @SNR_length = split/,/, $SNR_length_reads; + my $repetitive_test = substr $read_short_start2, 0, 10; + my $SNR_checkSNR = $repetitive_test =~ s/$SNR_nucleo_back/$SNR_nucleo_back/g; + + $SNR_length_count2 = '0'; + foreach my $SNR_length (keys %SNR_length) + { + my $SNR_length_count = $SNR_length{$SNR_length} =~ tr/,/,/; + if ($SNR_length_count > $SNR_length_count2 && $SNR_length_count ne $first) + { + $SNR_length_count2 = $SNR_length_count; + $most_SNR2 = $SNR_length; + } + } +print OUTPUT5 $first." 1st ".$SNR_length_count2." 2d ".$G." G\n"; + if ($first < 0.8*$G && $second_round eq "" && $first > 0.35*$G && $SNR_length_count2 > 0.35*($G-$G2)) + { + if ($y > $startprint2) + { + print OUTPUT5 "SPLIT_SNR\n"; + $no_SNR = "yes"; + } + goto NUCLEO0_BACK; + } + elsif ($first < 0.8*$G && $second_round eq "") + { + if ($y > $startprint2) + { + print OUTPUT5 $most_SNR." 
MOST_SNR SECOND ROUND\n"; + } + $second_round = "yes"; + $no_SNR1 = ""; + $most_SNR = '0'; + $G = '0'; + undef %SNR_count_back; + undef %SNR_length; + goto SNR1_BACK; + } + else + { + foreach my $SNRie (@SNR_length) + { + if (exists($extensions{$SNRie})) + { + $extensions_new{$SNRie} = $extensions{$SNRie}; + push @extensions_new, $SNRie; + } + } + %extensions = %extensions_new; + @extensions = @extensions_new; + } + } + + if ($SNR_back{$id} eq "yes2_double_back") + { + $SNR_test = "yes2_double_back"; + foreach my $extensions (@extensions) + { + my @chars = split//, $extensions; + my $e = '0'; + + if ($SNR_nucleo_back eq $chars[$e].$chars[$e+1] ) + { + while ($SNR_nucleo_back eq $chars[$e].$chars[$e+1]) + { + my $tempie = reverse $extensions; + chop $tempie; + chop $tempie; + $extensions = reverse $tempie; + $e++; + $e++; + } + } + else + { + while ($SNR_nucleo_back eq $chars[$e+1].$chars[$e]) + { + my $tempie = reverse $extensions; + chop $tempie; + chop $tempie; + $extensions = reverse $tempie; + $e++; + $e++; + } + } + $extensions_new{$extensions} = $extensions{$extensions}; + push @extensions_new, $extensions; + if ($e < length($extensions)) + { + $SNR_count_back{$extensions} = $e; + } + } + %extensions = %extensions_new; + @extensions = @extensions_new; + } + delete $SNR_back{$id}; + } +NUCLEO0_BACK: + if ($SNR_read_back ne "") + { + $ext = '0'; + foreach (@extensions) + { + $ext++; + } + } + my $extra_l = '0'; + $highest_all_freq = '0'; + +NUCLEO_BACK: while ($l < $read_length - ($overlap+$left-1) + $extra_l && $l < 149) + { + my $A = '0'; + my $C = '0'; + my $T = '0'; + my $G = '0'; + my $skipped = '0'; + + if ($SNR_read_back ne "" && $l > 0 && $split eq "") + { + my $best_extension_tmp8 = $best_extension; + $best_extension_tmp8 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $last_nuc = substr $best_extension_tmp8, -1; + my $arrSize1 = @extensions; + + if ($last_nuc ne "." 
&& $arrSize1 > 4) + { + my @extensions_tmp; + undef @extensions_tmp; + foreach my $extensions (@extensions) + { + my @chars = split //, $extensions; + if ($chars[$l-1] eq $last_nuc || length($extensions) < $l) + { + push @extensions_tmp, $extensions; + } + } + + my $arrSize2 = @extensions_tmp; + if ($arrSize1 ne $arrSize2) + { + undef @extensions; + @extensions = @extensions_tmp; + my $best_extension_dot = $best_extension_tmp8 =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\./\./; + + if ($best_extension_dot > 0) + { + $l = 0; + $best_extension = ""; + $SNP = ""; + goto NUCLEO0_BACK; + } + } + } + } + my $q = '2'; + foreach my $extensions (@extensions) + { + my @chars = split//, $extensions; + + if ($chars[$l] eq "A") + { + $A++; + } + elsif ($chars[$l] eq "C") + { + $C++; + } + elsif ($chars[$l] eq "T") + { + $T++; + } + elsif ($chars[$l] eq "G") + { + $G++; + } + elsif ($chars[$l] eq "N") + { + $skipped++; + } + } + my $c = '2.8'; + if ($ext > 22 && $SNR_read eq "") + { + $c = '3.7'; + } + if ($SNR_read2 ne "" && $l < 5) + { + $c = '1.4'; + } + if ($ext > 6 && ($type ne "chloro" || $extensions_before ne "") && $SNR_read_back eq "") + { + $c = '5'; + } + if ($ext > 22 && $SNR_read eq "" && $type ne "chloro") + { + $c = '6.5'; + } + if ($ext > 38 && $SNR_read eq "" && $type ne "chloro") + { + $c = '8.4'; + } + if ($ext > 100 && $SNR_read eq "" && $type eq "mito_plant") + { + $c = '13'; + } + if ($ext > $average_coverage_ext*6 && $SNR_read eq "" && $type eq "mito_plant") + { + $c = '25'; + } + if ($repetitive_detect_back ne "" && $ext < 23 && $SNR_read_back eq "") + { + $c = '7'; + } + if ($repetitive_detect_back ne "" && $ext >= 23 && $SNR_read_back eq "") + { + $c = '9'; + } + if ($repetitive_detect_back2 eq "yes" || ($repetitive_detect_back ne "" && $ext > 120)) + { + $c = '15'; + } + if ($extensions_before eq "yes" && $type eq "mito_plant") + { + $c = '13'; + } + if ($extensions_before eq "yes" && $ext > $average_coverage_ext*0.5 && $type eq "mito_plant") + { + $c = '23'; + } + if 
($type eq "mito_plant") + { + $c += 2; + } + my $z = '1'; + my $v = '6'; + my $s = '3'; + if ($split ne "" && $indel_split_back eq 0) + { + $v = '10'; + $s = '2'; + $z = '0'; + } + my $hp = 10000000000; + if ($heteroplasmy ne "" && $SNR_read_back eq "" && $repetitive_detect_back eq "" && $hp_seed_assemble eq "") + { + $hp = ($A + $T + $G + $C)*$heteroplasmy; + $q = 1.5; + } + if ($hp_seed_assemble ne "") + { + $v = '40'; + $q = '1.4'; + if ($c < 5 && $SNR_read eq "") + { + $c = '5'; + } + } + if ($heteroplasmy ne "" && $SNR_read_back eq "" && $SNP eq "" && $l > 7 && $highest_all_freq > $heteroplasmy/2) + { + chop($best_extension); + last NUCLEO_BACK; + } + if ($A > ($C + $T + $G)*$c && (($C <= $hp && $T <= $hp && $G <= $hp) || ($C < 2 && $T < 2 && $G < 2)) && (($A > $s && ($ext)/($A+$T+$G+$C+$skipped) < $q) || ($A > $z && $l < $v && ($C + $T + $G) eq 0 && ($ext)/($A+$T+$G+$C+$skipped) < $q))) + { + $best_extension = $best_extension."A"; + $allele_percentage_back{$l} = $A."+".$C."+".$T."+".$G; + $allele_total_back{$l} = $A+$C+$T+$G; + $highest_all_freq = ($C+$T+$G)/($A+$C+$T+$G); + } + elsif ($C > ($A + $T + $G)*$c && (($A <= $hp && $T <= $hp && $G <= $hp) || ($A < 2 && $T < 2 && $G < 2)) && (($C > $s && ($ext)/($A+$T+$G+$C+$skipped) < $q) || ($C > $z && $l < $v && ($A + $T + $G) eq 0 && ($ext)/($A+$T+$G+$C+$skipped) < $q))) + { + $best_extension = $best_extension."C"; + $allele_percentage_back{$l} = $A."+".$C."+".$T."+".$G; + $allele_total_back{$l} = $A+$C+$T+$G; + $highest_all_freq = ($A+$T+$G)/($A+$C+$T+$G); + } + elsif ($T > ($A + $C + $G)*$c && (($C <= $hp && $A <= $hp && $G <= $hp) || ($C < 2 && $A < 2 && $G < 2)) && (($T > $s && ($ext)/($A+$T+$G+$C+$skipped) < $q) || ($T > $z && $l < $v && ($A + $C + $G) eq 0 && ($ext)/($A+$T+$G+$C+$skipped) < $q))) + { + $best_extension = $best_extension."T"; + $allele_percentage_back{$l} = $A."+".$C."+".$T."+".$G; + $allele_total_back{$l} = $A+$C+$T+$G; + $highest_all_freq = ($A+$C+$G)/($A+$C+$T+$G); + } + elsif ($G > ($C + 
$T + $A)*$c && (($C <= $hp && $T <= $hp && $A <= $hp) || ($C < 2 && $T < 2 && $A < 2)) && (($G > $s && ($ext)/($A+$T+$G+$C+$skipped) < $q) || ($G > $z && $l < $v && ($C + $T + $A) eq 0 && ($ext)/($A+$T+$G+$C+$skipped) < $q))) + { + $best_extension = $best_extension."G"; + $allele_percentage_back{$l} = $A."+".$C."+".$T."+".$G; + $allele_total_back{$l} = $A+$C+$T+$G; + $highest_all_freq = ($A+$T+$C)/($A+$C+$T+$G); + } + elsif ($hp_seed_assemble ne "" && $SNR_read_back ne "" && $l > 5) + { + last NUCLEO_BACK; + } + elsif ((($heteroplasmy ne "" && $l eq '0') || $SNP_active_back eq "yes" || ($SNR_read_back ne "" && $l > 0) || ($extensions_before eq "yes" && $ext_before ne "yes")) && $SNP eq "" && ($A + $T + $G + $C) > 4 && (($l < 15 && $split eq "") || ($l < 11 && $split ne "")) && ($ext)/($A+$T+$G+$C+$skipped) < $q) + { + delete $SNP_active_back{$id}; + $SNP = "yes_back"; + $A_SNP = $A; + $C_SNP = $C; + $T_SNP = $T; + $G_SNP = $G; + $position_SNP += $l; + $pos_SNP = $l; + + my @IUPAC = IUPAC($A,$C,$T,$G); + $allele_percentage_back{$l} = $A."+".$C."+".$T."+".$G; + $best_extension = $best_extension.$IUPAC[0]; + + $allele_total_back{$l} = $A+$C+$T+$G; + } + elsif ($hp_seed_assemble ne "" && $PCR_free ne "yes") + { + last NUCLEO_BACK; + } + elsif ((($heteroplasmy ne "" && $l < $pos_SNP+10) || $heteroplasmy eq "") && $SNP eq "yes_back" && ($A + $T + $G + $C) > 4 && $l < 15 && ($ext)/($A+$T+$G+$C+$skipped) < $q && ($ext)/($A+$T+$G+$C+$skipped) < $q) + { + if ($heteroplasmy ne "") + { + my %remove_extension_mismatch_tmp = mismatch (\%extensions, \%remove_extension_mismatch, $best_extension); + %remove_extension_mismatch = (%remove_extension_mismatch, %remove_extension_mismatch_tmp) + } + $SNP = "yes2_back"; + $A_SNP2 = $A; + $C_SNP2 = $C; + $T_SNP2 = $T; + $G_SNP2 = $G; + $position_SNP2 += $l; + $pos_SNP2 = $l; + + my @IUPAC = IUPAC($A,$C,$T,$G); + $allele_percentage_back{$l} = $A."+".$C."+".$T."+".$G; + $best_extension = $best_extension.$IUPAC[0]; + + $allele_total_back{$l} 
= $A+$C+$T+$G; + } + elsif ((($heteroplasmy ne "" && $l < $pos_SNP2+10) || $heteroplasmy eq "") && $SNP eq "yes2_back" && ($A + $T + $G + $C) > 4 && $l < 15 && ($ext)/($A+$T+$G+$C+$skipped) < $q && ($ext)/($A+$T+$G+$C+$skipped) < $q) + { + if ($heteroplasmy ne "") + { + my %remove_extension_mismatch_tmp = mismatch (\%extensions, \%remove_extension_mismatch, $best_extension); + %remove_extension_mismatch = (%remove_extension_mismatch, %remove_extension_mismatch_tmp) + } + $SNP = "yes3_back"; + $A_SNP3 = $A; + $C_SNP3 = $C; + $T_SNP3 = $T; + $G_SNP3 = $G; + $position_SNP3 += $l; + $pos_SNP3 = $l; + + my @IUPAC = IUPAC($A,$C,$T,$G); + $allele_percentage_back{$l} = $A."+".$C."+".$T."+".$G; + $best_extension = $best_extension.$IUPAC[0]; + + $allele_total_back{$l} = $A+$C+$T+$G; + } + elsif ((($heteroplasmy ne "" && $l < $pos_SNP3+10) || $heteroplasmy eq "") && $SNP eq "yes3_back" && ($A + $T + $G + $C) > 4 && $l < 15 && ($ext)/($A+$T+$G+$C+$skipped) < $q && ($ext)/($A+$T+$G+$C+$skipped) < $q) + { + if ($heteroplasmy ne "") + { + my %remove_extension_mismatch_tmp = mismatch (\%extensions, \%remove_extension_mismatch, $best_extension); + %remove_extension_mismatch = (%remove_extension_mismatch, %remove_extension_mismatch_tmp) + } + $SNP = "yes4_back"; + $A_SNP3 = $A; + $C_SNP3 = $C; + $T_SNP3 = $T; + $G_SNP3 = $G; + $position_SNP3 += $l; + $pos_SNP3 = $l; + + my @IUPAC = IUPAC($A,$C,$T,$G); + $allele_percentage_back{$l} = $A."+".$C."+".$T."+".$G; + $best_extension = $best_extension.$IUPAC[0]; + + $allele_total_back{$l} = $A+$C+$T+$G; + } + elsif ((($heteroplasmy ne "" && $l < $pos_SNP3+10) || $heteroplasmy eq "") && $SNP eq "yes4_back" && ($pos_SNP ne 0 || ($pos_SNP3 > $pos_SNP+12 && $l > 11 ) || ($extensions_before eq "yes" && $l > 9)) && $split eq "" && ($ext)/($A+$T+$G+$C+$skipped) < $q) + { + $SNP = "yes5_back"; + my $g = $l; + my $pos_SNP_tmp = $pos_SNP; + if ($pos_SNP3 > $pos_SNP+12) + { + $pos_SNP_tmp = $pos_SNP3; + } + if ($extensions_before ne "yes" && $pos_SNP 
ne 0 && $SNR_read2 eq "") + { + while ($g > $pos_SNP_tmp) + { + chop($best_extension); + $g--; + } + } + last NUCLEO_BACK; + } + + elsif (($SNP eq "yes4_back" && $pos_SNP eq 0 && $l < 15) || ($indel_split_skip_back ne "yes" && $l eq 0 && $ext > 4) && ($A + $T + $G + $C) > 4 && ($ext)/($A+$T+$G+$C+$skipped) < $q) + { + if ($y > $startprint2) + { + print OUTPUT5 $SNP." SNP\n"; + } + if ($SNR_test ne "" && $no_SNR ne "yes") + { + $l = '0'; + $best_extension = ""; + $no_SNR = "yes"; + %extensions = %extensions_backup; + @extensions = @extensions_backup; + + goto SPLIT_BACK; + } + if ($SNP eq "") + { + $A_SNP = $A; + $C_SNP = $C; + $T_SNP = $T; + $G_SNP = $G; + if ($y > $startprint2) + { + print OUTPUT5 $best_extension." BEST_EXTENSIONll\n"; + print OUTPUT5 $A_SNP." A\n"; + print OUTPUT5 $C_SNP." C\n"; + print OUTPUT5 $T_SNP." T\n"; + print OUTPUT5 $G_SNP." G\n"; + } + } + if ($split ne "") + { + $read_new = $read_new1; + $best_extension = ""; + delete $seed{$id}; + $seed{$id} = $read_new; + + delete $last_chance_back{$id}; + $noback = "stop"; + $noback{$id} = "stop"; + if ($noforward ne "stop") + { + goto SEED; + } + else + { + delete $seed{$id}; + $id = $id_original; + $split = ""; + goto AFTER_EXT_BACK; + } + } + $best_extension = ""; + $split = "yes_back"; + + undef @firstSNP_max; + my $w = 0.035; + if ($type eq "mito_plant") + { + $w = 0.015; + } + + if ($A_SNP >= ($C_SNP+$A_SNP+$T_SNP+$G_SNP)*$w && $A_SNP > 0) + { + if (exists($yuyu_option_back{$id.'A'})) + { + if ($y > $startprint2) + { + print OUTPUT5 "YUYU_A_EXISTS\n"; + } + } + else + { + push @firstSNP_max, "A"; + } + } + if ($C_SNP >= ($C_SNP+$A_SNP+$T_SNP+$G_SNP)*$w && $C_SNP > 0) + { + if (exists($yuyu_option_back{$id.'C'})) + { + if ($y > $startprint2) + { + print OUTPUT5 "YUYU_C_EXISTS\n"; + } + } + else + { + push @firstSNP_max, "C"; + } + } + if ($T_SNP >= ($C_SNP+$A_SNP+$T_SNP+$G_SNP)*$w && $T_SNP > 0) + { + if (exists($yuyu_option_back{$id.'T'})) + { + if ($y > $startprint2) + { + print OUTPUT5 
"YUYU_T_EXISTS\n"; + } + } + else + { + push @firstSNP_max, "T"; + } + } + if ($G_SNP >= ($C_SNP+$A_SNP+$T_SNP+$G_SNP)*$w && $G_SNP > 0) + { + if (exists($yuyu_option_back{$id.'G'})) + { + if ($y > $startprint2) + { + print OUTPUT5 "YUYU_G_EXISTS\n"; + } + } + else + { + push @firstSNP_max, "G"; + } + } + + $count_split = @firstSNP_max; + my $count_split_tmp = '0'; + + if ($count_split eq '2') + { + $delete_third = "yes_back"; + $delete_second = "yes_back"; + } + if ($count_split eq '3') + { + $delete_third = "yes_back"; + } + if ($count_split eq '1') + { + $delete_third = "yes_back"; + $delete_first = "yes_back"; + $delete_second = "yes_back"; + $split = "yes4_back"; + } + + foreach my $firstSNP_max (@firstSNP_max) + { + foreach my $extensions_tmp (@extensions) + { + my @chars = split//, $extensions_tmp; + if ($chars[0] eq $firstSNP_max && $count_split_tmp eq '0') + { + $extensions_group1{$extensions_tmp} = $extensions{$extensions_tmp}; + push @extensions_group1, $extensions_tmp; + } + elsif ($chars[0] eq $firstSNP_max && $count_split_tmp eq '1') + { + $extensions_group2{$extensions_tmp} = $extensions{$extensions_tmp}; + push @extensions_group2, $extensions_tmp; + } + elsif ($chars[0] eq $firstSNP_max && $count_split_tmp eq '2') + { + $extensions_group3{$extensions_tmp} = $extensions{$extensions_tmp}; + push @extensions_group3, $extensions_tmp; + } + elsif ($chars[0] eq $firstSNP_max && $count_split_tmp eq '3') + { + $extensions_group4{$extensions_tmp} = $extensions{$extensions_tmp}; + push @extensions_group4, $extensions_tmp; + } + } + $count_split_tmp++; + } + if ($y > $startprint2) + { + print OUTPUT5 $count_split." 
COUNT_SPLIT\n"; + } + goto SPLIT_BACK; + } + else + { + last NUCLEO_BACK; + } + $l++; + } + my $last_nucleo = substr $best_extension, -1; + $last_nucleo =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + while ($last_nucleo eq '.') + { + chop($best_extension); + $last_nucleo = substr $best_extension, -1; + $last_nucleo =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + } + if ($split eq "yes2_back" || ($split eq "yes3_back" && $count_split > 2) || ($split eq "yes4_back" && $count_split > 3)) + { + if ($split eq "yes2_back") + { + $best_extension2 = $best_extension; + } + elsif ($split eq "yes3_back") + { + $best_extension3 = $best_extension; + } + elsif ($split eq "yes4_back") + { + $best_extension4 = $best_extension; + } + my $best_extension_tmp = $best_extension; + $best_extension_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + if ($y > $startprint2) + { + if ($split eq "yes2_back") + { + print OUTPUT5 "GROUP2\n"; + foreach my $extensions_tmp (@extensions_group2) + { + print OUTPUT5 $extensions_tmp."\n"; + } + print OUTPUT5 $best_extension_tmp." BEST_EXTENSION_BACK2\n\n"; + } + elsif ($split eq "yes3_back") + { + print OUTPUT5 "GROUP3\n"; + foreach my $extensions_tmp (@extensions_group3) + { + print OUTPUT5 $extensions_tmp."\n"; + } + print OUTPUT5 $best_extension_tmp." BEST_EXTENSION_BACK3\n\n"; + } + elsif ($split eq "yes4_back") + { + print OUTPUT5 "GROUP4\n"; + foreach my $extensions_tmp (@extensions_group4) + { + print OUTPUT5 $extensions_tmp."\n"; + } + print OUTPUT5 $best_extension_tmp." 
BEST_EXTENSION_BACK4\n\n"; + } + } + if ((length($best_extension2) < 3 || (length($best_extension2) < 6 && $ext > 15)) && $repetitive_detect_back eq "" && $before_back eq "yesss") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION_BACK 2\n\n"; + } + $delete_first = "yes_back"; + goto SPLIT_BACK; + } + if ($type eq "chloroplst" && length($best_extension2) > 20) + { + my $best_extension2_reverse2 = $best_extension2; + $best_extension2_reverse2 =~ tr/ATCG/TAGC/; + my $best_extension2_reverse = reverse($best_extension2_reverse2); + $best_extension2_reverse =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + my $read_cp = $read; + my $read_start_rev_tmp = reverse($read_start); + $read_start_rev_tmp =~ tr/ATCG/TAGC/; + $read_start_rev_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + if (length($read) > $genome_range_low) + { + } + my $found_seq_cp2 = $read_cp =~ s/$best_extension2_reverse/$best_extension2_reverse/; + my $found_seq_cp4 = $read_cp =~ s/$read_start_rev_tmp/$read_start_rev_tmp/; + + if ($found_seq_cp2 > 0 && $found_seq_cp4 > 0) + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION 2 (CP)\n\n"; + } + $delete_first = "yes_back"; + goto SPLIT_BACK; + } + } + my $end_SNR = substr $read_start, 0,4; + $end_SNR =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $GGGG = $end_SNR =~ tr/G/G/; + my $TTTT = $end_SNR =~ tr/T/T/; + my $CCCC = $end_SNR =~ tr/C/C/; + my $AAAA = $end_SNR =~ tr/A/A/; + if (($GGGG eq '4' || $TTTT eq '4' || $CCCC eq '4' || $AAAA eq '4') && $before_back eq "yesss") + { + $GGGG = $best_extension2 =~ tr/G/G/; + $TTTT = $best_extension2 =~ tr/T/T/; + $CCCC = $best_extension2 =~ tr/C/C/; + $AAAA = $best_extension2 =~ tr/A/A/; + if ($GGGG eq length($best_extension2) || $TTTT eq length($best_extension2) || $CCCC eq length($best_extension2) || $AAAA eq length($best_extension2)) + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION 2 (SNR)\n\n"; + } + $delete_first = "yes_back"; + goto SPLIT_BACK; + } + } + if 
(length($best_extension2) > 9 && $before_back eq "yesss") + { + my $end_tmp = substr $read_start, 0, -10; + if (length($best_extension2) < 15) + { + $end_tmp = substr $read_start, 0, -length($best_extension2)+5; + } + $end_tmp = reverse($best_extension2).$end_tmp; + $end_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $s = '0'; + my $foundit = ""; + while ($s < length($end_tmp)-$overlap) + { + my $end_tmp_d = substr $end_tmp, -($s+$overlap), $overlap; + if ($containX_short_start2 > 0) + { + my $star = $end_tmp_d =~ tr/\*/\*/; + + $end_tmp_d = substr $end_tmp, -($s+$overlap+($star*2)), $overlap+($star*2); + my $star2 = $end_tmp_d =~ tr/\*/\*/; + while ($star2 > $star) + { + $end_tmp_d = substr $end_tmp, -($s+$overlap+($star*2)), $overlap+($star*2); + $star = $star2; + $star2 = $end_tmp_d =~ tr/\*/\*/; + } + } + + my %end_tmp_d = build_partial3b $end_tmp_d, "reverse_back"; + foreach my $end_tmp_d (keys %end_tmp_d) + { + if (exists($hash2b{$end_tmp_d})) + { + $foundit = "yes_back"; + } + elsif (exists($hash2c{$end_tmp_d})) + { + $foundit = "yes_back"; + } + } + $s++; + } + if ($foundit ne "yes_back") + { + print OUTPUT5 "\nDELETE BEST EXTENSION 2 (no reverse match)\n\n"; + $delete_first = "yes_back"; + goto SPLIT_BACK; + } + } + if ($split eq "yes2_back") + { + delete $seed{$id}; + $id = "b".$position_back."_".$id; + $id_split1 = $id; + } + elsif ($split eq "yes3_back") + { + $id = "bb".$position_back."_".$id; + $id_split2 = $id; + } + elsif ($split eq "yes4_back") + { + $id = "bbb".$position_back."_".$id; + $id_split3 = $id; + } + + $position_back{$id} = $position_back; + $position{$id} = $position; + $contig_count{$id} = $contig_count{$id_original}; + + my $count_contig_tmp = $contig_count; + while ($count_contig_tmp > 0) + { + $contig_gap_min{$id."_".$count_contig_tmp} = $contig_gap_min{$id_original."_".$count_contig_tmp}; + $contig_gap_max{$id."_".$count_contig_tmp} = $contig_gap_max{$id_original."_".$count_contig_tmp}; + $count_contig_tmp--; + } + if 
(exists($old_id{$id_original})) + { + $old_id{$id} = $old_id{$id_original}; + } + if (exists($old_rep{$id_original})) + { + $old_rep{$id} = $old_rep{$id_original}; + } + if (exists($old_rep_old{$id_original})) + { + $old_rep_old{$id} = $old_rep_old{$id_original}; + } + if (exists($noforward{$id_original})) + { + $noforward{$id} = $noforward; + } + if (exists($seed_split{$id_original})) + { + $seed_split{$id} = undef; + } + if (exists($nosecond{$id_original})) + { + $nosecond{$id} = undef; + } + if (exists($before_shorter_skip{$id_original})) + { + $before_shorter_skip{$id} = undef; + } + if (exists($jump_rep{$id_original})) + { + $jump_rep{$id} = undef; + } + if (exists($jump_rep_because_stuck{$id_original})) + { + $jump_rep_because_stuck{$id} = undef; + } + if (exists($rep_return{$id_original})) + { + $rep_return{$id} = $rep_return{$id_original}; + } + if (exists($rep_return_back{$id_original})) + { + $rep_return_back{$id} = $rep_return_back{$id_original}; + } + if (exists($last_ref_seq_forward{$id_original})) + { + $last_ref_seq_forward{$id} = $last_ref_seq_forward{$id_original}; + } + if (exists($last_ref_seq_back{$id_original})) + { + $last_ref_seq_back{$id} = $last_ref_seq_back{$id_original}; + } + if (exists($large_variance_forward{$id_original})) + { + $large_variance_forward{$id} = $large_variance_forward{$id_original}; + } + if (exists($large_variance_back{$id_original})) + { + $large_variance_back{$id} = $large_variance_back{$id_original}; + } + if (exists($large_variance_length_forward{$id_original})) + { + $large_variance_length_forward{$id} = $large_variance_length_forward{$id_original}; + } + if (exists($large_variance_length_back{$id_original})) + { + $large_variance_length_back{$id} = $large_variance_length_back{$id_original}; + } + if (exists($last_ref_pos_forward{$id_original})) + { + $last_ref_pos_forward{$id} = $last_ref_pos_forward{$id_original}; + } + if (exists($last_ref_pos_back{$id_original})) + { + $last_ref_pos_back{$id} = 
$last_ref_pos_back{$id_original}; + } + if (exists($no_next_seed{$id_original})) + { + $no_next_seed{$id} = $no_next_seed{$id_original}; + } + } + elsif ($split eq "yes5_back" || (($variance_detection eq "yes" || $heteroplasmy ne "") && $best_extension ne "" && $repetitive_detect_back eq "")) + { + if ($split eq "" && $repetitive_detect_back eq "" && ($variance_detection eq "yes" || $heteroplasmy ne "")) + { + goto REFERENCE_BACK; + } + $best_extension1 = $best_extension; + + if ($y > $startprint2) + { + print OUTPUT5 "GROUP1\n"; + foreach my $extensions_tmp (@extensions_group1) + { + print OUTPUT5 $extensions_tmp."\n"; + } + print OUTPUT5 $best_extension1." BEST_EXTENSION_BACK1\n\n"; + } + if ((length($best_extension1) < 3 || (length($best_extension1) < 6 && $ext > 15)) && $before_back eq "yessss" && $repetitive_detect_back eq "") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION 1\n\n"; + } + if ($delete_first eq "yes_back" && $last_chance_back eq "yes") + { + delete $seed{$id}; + $delete_first = "yes2_back"; + $noback = "stop"; + $noback{$id} = "stop"; + goto FINISH; + } + elsif ($delete_first eq "yes_back") + { + $best_extension = ""; + goto AFTER_EXT_BACK; + } + else + { + goto SEED; + } + } + if ($type eq "chlorop" && length($best_extension1) > 20) + { + my $best_extension1_reverse2 = $best_extension1; + $best_extension1_reverse2 =~ tr/ATCG/TAGC/; + my $best_extension1_reverse = reverse($best_extension1_reverse2); + $best_extension1_reverse =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + my $read_cp = $read; + my $read_start_rev_tmp = reverse($read_start); + $read_start_rev_tmp =~ tr/ATCG/TAGC/; + if (length($read) > $genome_range_low) + { + $read_cp = substr $read, 0, -5000; + $read_cp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + } + + my $found_seq_cp2 = $read_cp =~ s/$best_extension1_reverse/$best_extension1_reverse/; + my $found_seq_cp4 = $read_cp =~ s/$read_start_rev_tmp/$read_start_rev_tmp/; + + if ($found_seq_cp2 > 0 && $found_seq_cp4 > 0) + { + if 
($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION 1 (CP)\n\n"; + } + if ($delete_first eq "yes_back") + { + delete $seed{$id}; + $delete_first = "yes2_back"; + $noback = "stop"; + $noback{$id} = "stop"; + goto FINISH; + } + goto SEED; + } + } + my $end_SNR = substr $read_start,0, 4; + $end_SNR =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $GGGG = $end_SNR =~ tr/G/G/; + my $TTTT = $end_SNR =~ tr/T/T/; + my $CCCC = $end_SNR =~ tr/C/C/; + my $AAAA = $end_SNR =~ tr/A/A/; + if (($GGGG eq '4' || $TTTT eq '4' || $CCCC eq '4' || $AAAA eq '4') && $before_back eq "yesss") + { + $GGGG = $best_extension1 =~ tr/G/G/; + $TTTT = $best_extension1 =~ tr/T/T/; + $CCCC = $best_extension1 =~ tr/C/C/; + $AAAA = $best_extension1 =~ tr/A/A/; + if ($GGGG eq length($best_extension1) || $TTTT eq length($best_extension1) || $CCCC eq length($best_extension1) || $AAAA eq length($best_extension1)) + { + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE BEST EXTENSION 1 (SNR)\n\n"; + } + if ($delete_first eq "yes_back") + { + delete $seed{$id}; + $delete_first = "yes2_back"; + $noback = "stop"; + $noback{$id} = "stop"; + goto FINISH; + } + goto SEED; + } + } + if (length($best_extension1) > 9 && $before_back eq "yessss") + { + my $end_tmp = substr $read_start,0, -10; + if (length($best_extension1) < 15) + { + $end_tmp = substr $read_start, 0, -length($best_extension1)+5; + } + $end_tmp = reverse($best_extension1).$end_tmp; + $end_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $s = '0'; + my $foundit = ""; + while ($s < length($end_tmp)-$overlap) + { + my $end_tmp_d = substr $end_tmp, -($s+$overlap), $overlap; + + if ($containX_short_start2 > 0) + { + my $star = $end_tmp_d =~ tr/\*//; + + $end_tmp_d = substr $end_tmp, -($s+$overlap+($star*2)), $overlap+($star*2); + my $star2 = $end_tmp_d =~ tr/\*//; + while ($star2 > $star) + { + $end_tmp_d = substr $end_tmp, -($s+$overlap+($star*2)), $overlap+($star*2); + $star = $star2; + $star2 = $end_tmp_d =~ tr/\*//; + } + } + my %end_tmp_d = 
build_partial3b $end_tmp_d, "reverse_back"; + foreach my $end_tmp_d (keys %end_tmp_d) + { + if (exists($hash2b{$end_tmp_d})) + { + $foundit = "yes_back"; + } + elsif (exists($hash2c{$end_tmp_d})) + { + $foundit = "yes_back"; + } + } + $s++; + } + if ($foundit ne "yes_back") + { + print OUTPUT5 "\nDELETE BEST EXTENSION 1 (no reverse match)\n\n"; + if ($delete_first eq "yes_back") + { + delete $seed{$id}; + $delete_first = "yes2_back"; + $noback = "stop"; + $noback{$id} = "stop"; + goto FINISH; + } + goto SEED; + } + } + my $read_part_back_tmp = substr $read_short_start2, 0, $read_length; + my $star_check = $read_part_back_tmp =~ tr/\*/\*/; + if ($SNP_active_back eq "yes" && ($before_back eq "yes" || $extensions_before eq "yes" || $platform eq "ion") && $count_split eq '2' && ($star_check eq 0 || $star_check eq "" || $platform eq "ion") && $ext < $average_coverage_ext*3) + { + my @check_deletion = check_deletion($best_extension1, $best_extension2, $best_extension_old1, $best_extension_old2, "", "back"); + $best_extension = $check_deletion[0]; + + if ($best_extension ne "") + { + if ($y > $startprint2) + { + print OUTPUT5 reverse($best_extension)." 
BEST_EXTENSION_BACK_DEL\n"; + } + $deletion_back = "yes"; + $read_new = $read_new1; + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + delete $indel_split_back{$id}; + if ($heteroplasmy eq "") + { + goto INDEL_BACK; + } + else + { + $split = ""; + } + } + } +REFERENCE_BACK: + if ((((length($best_extension1) > 4 && length($best_extension2) > 4) || (length($best_extension_old1) > 4 && length($best_extension_old2) > 4)) && $reference ne "" && $SNP_active_back eq "yes" && $repetitive_detect_back eq "" && $deletion_back eq "") || (($variance_detection eq "yes" || $heteroplasmy ne "") && $best_extension ne "" && $repetitive_detect_back eq "")) + { + my $p = 0; + my $p_prev = '-100'; + if ($y > $startprint2) + { + print OUTPUT5 "CHECK_REFERENCE_BACK\n\n"; + } + my $ref_part_prev; + my $found_further_back; + my %ref_id3; + my @ref_id3; + my $further = ""; + my $first_last_seq_ref; + my $last_seq_ref; + my $check_back_length = '800'; + +CHECK_REF_BACK: while ($p < $check_back_length && $p < length($read)) + { + if ($found_further_back ne "yes") + { + undef @ref_id3; + undef %ref_id3; + } + my $read_short_start2_tmp = substr $read, 0, $check_back_length+200; + if (exists($last_ref_seq_back{$id}) && $first_last_seq_ref ne "yes") + { + $read_short_start2_tmp = $last_ref_seq_back{$id}; + if (length($read_short_start2_tmp)+$p < 0) + { + $first_last_seq_ref = "yes"; + $read_short_start2_tmp = $read_short_start2; + } + } + my $ref_part2 = substr $read_short_start2_tmp, $p, 30; + my $star2; + if ($containX_short_start2 > 0) + { + my $star = $ref_part2 =~ tr/\*/\*/; + + $ref_part2 = substr $read_short_start2_tmp, $p, 30+($star*2); + $star2 = $ref_part2 =~ tr/\*/\*/; + while ($star2 > $star) + { + $ref_part2 = substr $read_short_start2_tmp, $p, 30+($star*2)+(($star2-$star)*2); + $star = $star2; + $star2 = $ref_part2 =~ tr/\*/\*/; + } + } + $ref_part2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my %ref_part = build_partial3b $ref_part2, 
"back"; + if ($found_further_back eq "yes") + { + $p--; + } + my $ref_loc = $p; + if ($found_further_back eq "") + { + foreach my $ref_part_tmp (keys %ref_part) + { + if (exists($hashref{$ref_part_tmp})) + { + my $ref_id3 = $hashref{$ref_part_tmp}; + my $ref_id2 = substr $ref_id3, 1; + my @ref_id3_tmp; + my @ref_id3_tmp2; + + if ($found_further_back eq "") + { + @ref_id3_tmp = split /,/, $ref_id2; + } + else + { + $ref_part_tmp = $ref_part_prev; + } + + foreach (@ref_id3_tmp) + { + if (($_ < $last_ref_pos_back{$id}+5000 && $_ > $last_ref_pos_back{$id}-5000) || $last_ref_pos_back{$id} eq "" || $first_150 ne "") + { + $ref_id3{$_} = $ref_part_tmp; + push @ref_id3_tmp2, $_; + } + } + push @ref_id3, @ref_id3_tmp2; + + if ($y > $startprint2) + { + print OUTPUT5 $ref_part_tmp." EXISTS ".$ref_loc." LOC ".@ref_id3_tmp2." LOC_REF\n"; + } + } + } + } + if ($y > $startprint2) + { + } +CHECKED_BACK_REF_BACK: + if (@ref_id3 eq 1) + { + foreach my $ref_id (@ref_id3) + { + $last_ref_pos_back{$id} = $ref_id; + my $prev_loc1 = $ref_id - $ref_loc - 1; + if ($prev_loc1 eq "-1") + { + $prev_loc1 = '0'; + } + my $prev_loc_star = $ref_id - $ref_loc - 1; + my @delete; + if (exists($last_ref_seq_back{$id}) && $first_last_seq_ref ne "yes") + { + } + else + { + foreach my $var_pos_tmp (keys %variance_back) + { + my @split = split /\+/, $var_pos_tmp; + + my $var_pos = $split[1]; + if ($split[0] eq $id) + { + if ($var_pos < $ref_id) + { + $prev_loc1 += $variance_back{$var_pos_tmp}; + $prev_loc_star += $variance_back{$var_pos_tmp}; + } + if ($ref_id - $ref_loc -1 + $check_back_length < $var_pos) + { + push @delete, $var_pos_tmp; + } + } + } + } + foreach my $delete (@delete) + { + delete $variance_back{$delete}; + } + + my $test_star = substr $read_short_start2_tmp, 0, $p-29; + my $star2b = '0'; + if ($containX_short_end2 > 0) + { + $star2b = $test_star =~ tr/\*/\*/; + } + + $last_seq_ref = $hashref2{$prev_loc1+1}; + if ($prev_loc1 < 0) + { + if ($heteroplasmy ne "" && $first_150 eq "") + { + 
$noback{$id} = "stop"; + $noback = "stop"; + } + else + { + if ($split eq "") + { + print OUTPUT5 $best_extension." BEST_EXTENSION_BACK_HP\n"; + delete $last_ref_seq_back{$id}; + goto AFTER_EXT_BACK; + } + goto INDEL_BACKa; + } + } + if (exists($hashref2{$prev_loc1})) + { + my $prev_loc1_tmp = $prev_loc1; + my $ref_check_star; + my $ref_check; + my $j = '149'; + my $e = '1'; + + if ($star2 > 1000000000000000000000) + { + if (exists($hashref2{$prev_loc_star})) + { + $prev_loc1_tmp = $prev_loc_star; + if ($y > $startprint2) + { + print OUTPUT5 $prev_loc1_tmp." EXISTSREF_star\n"; + } + } + } + + while ($j > 28) + { + if (exists($hashref2{$prev_loc1_tmp-$j})) + { + $ref_check .= $hashref2{$prev_loc1_tmp-$j}; + } + elsif ($heteroplasmy ne "" && $prev_loc1_tmp-$j < 0 && ($j > length($best_extension) || ($split ne "" && ($j > length($best_extension1) || $j > length($best_extension2))))) + { + if ($prev_loc1_tmp <= 30) + { + $ref_check = substr $hashref2{1}, 0, $prev_loc1_tmp; + last; + } + } + elsif ($y > 15 && $heteroplasmy ne "" && ($j < length($best_extension) || ($split ne "" && ($j < length($best_extension1) || $j < length($best_extension2))))) + { + my $best_extension_tmp = substr $best_extension, 0, length($ref_check); + if ($split ne "") + { + $best_extension1 = substr $best_extension1, 0, length($ref_check); + $best_extension2 = substr $best_extension2, 0, length($ref_check); + } + $best_extension = $best_extension_tmp; + $noback{$id} = "stop"; + $noback = "stop"; + } + $j -= 30; + } + + + $ref_check =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + if ($y > $startprint2) + { + print OUTPUT5 reverse($ref_check)." EXISTSREF1 ".$prev_loc1." 
PREV_LOC1\n"; + } + + my $best_extension1_tmp; + my $best_extension2_tmp; + my $best_extension3_tmp; + my $best_extension4_tmp; + if (length($best_extension_old1) > length($best_extension1)) + { + $best_extension1_tmp = $best_extension_old1; + } + else + { + $best_extension1_tmp = $best_extension1; + } + if (length($best_extension_old2) > length($best_extension2)) + { + $best_extension2_tmp = $best_extension_old2; + } + else + { + $best_extension2_tmp = $best_extension2; + } + if (length($best_extension_old3) > length($best_extension3)) + { + $best_extension3_tmp = $best_extension_old3; + } + else + { + $best_extension3_tmp = $best_extension3; + } + if (length($best_extension_old4) > length($best_extension4)) + { + $best_extension4_tmp = $best_extension_old4; + } + else + { + $best_extension4_tmp = $best_extension4; + } + + my $best_extension1_part = reverse(substr $best_extension1_tmp, 0, 25); + my $best_extension2_part = reverse(substr $best_extension2_tmp, 0, 25); + my $best_extension3_part = reverse(substr $best_extension3_tmp, 0, 25); + my $best_extension4_part = reverse(substr $best_extension4_tmp, 0, 25); + my $best_extension1_partb = reverse(substr $best_extension1_tmp, 5, 25); + my $best_extension2_partb = reverse(substr $best_extension2_tmp, 5, 25); + my $best_extension3_partb = reverse(substr $best_extension3_tmp, 5, 25); + my $best_extension4_partb = reverse(substr $best_extension4_tmp, 5, 25); + + my $reference_guided1 = '0'; + my $reference_guided2 = '0'; + my $reference_guided3 = '0'; + my $reference_guided4 = '0'; + my $best_extension_del = $best_extension; + if (($variance_detection eq "yes" || $heteroplasmy ne "") && $split eq "" && $repetitive_detect_back eq "") + { + my $del_detect = '1'; +DEL_DETECT_BACK: if ($deletion_back eq "yes" && $del_detect eq '1') + { + my $best_extension1_tmp = $best_extension; + $best_extension1_tmp =~ tr/\*//d; + $best_extension = $best_extension1_tmp; + } + elsif ($deletion_back eq "yes" && $del_detect eq '2') + { + 
my $best_extension2_tmp = $best_extension_del; + $best_extension2_tmp =~ s/\*.//g; + $best_extension = $best_extension2_tmp; + } + + if ($y > $startprint2) + { + print OUTPUT5 $best_extension." BEST_EXTENSION_BACK\n\n"; + } + + my $ref_check_tmp = reverse($ref_check); + + if (exists($large_variance_back{$id})) + { + $ref_check_tmp = reverse($hashref2{$large_variance_back{$id}-150}.$hashref2{$large_variance_back{$id}-120}.$hashref2{$large_variance_back{$id}-90}.$hashref2{$large_variance_back{$id}-60}.$hashref2{$large_variance_back{$id}-30}); + print OUTPUT5 $ref_check_tmp." EXISTSREF1_VAR_DETECT_BACK\n\n"; + } + + my $best_extension_tmp = $best_extension; + my $deletion_found = ""; + my $save_seq_ref; + my $max_SNP = 1; + + if (exists($large_variance_back{$id}) && length($best_extension) < 13) + { + $best_extension_tmp = reverse(reverse($best_extension).$read_start); + } +VAR_START_BACK: + my @line = split //, $best_extension_tmp; + my @ref_check = split //, $ref_check_tmp; + my $gh = '0'; + my $th = '0'; + my $var_SNP_detect; + +VAR_CHECK_BACK: while ($gh < length($ref_check_tmp)-length($best_extension_tmp) && $SNR_read_back eq "") + { + my $d = '0'; + my $next = '0'; + my @pos; + my $pos; + my $pos_first = ""; + my $v = '4'; + my $AF; + my $DP; + if (length($best_extension_tmp) > 15) + { + $v = '6'; + } + + while ($d < length($best_extension_tmp)) + { + $th = $d + $gh; + if ($line[$d] eq $ref_check[$th]) + { + if ($next ne '0' && (($pos-$prev_loc1-$gh) < $d-$v || $d eq length($best_extension_tmp)-1)) + { + foreach my $pos_tmp (@pos) + { + my $position_tmp; + my $nuc_other_than_ref = ""; + if ($AF eq "") + { + $position_tmp = $pos_first; + $pos_first = $pos_tmp; + } + else + { + $position_tmp = $prev_loc1-$pos_tmp; + } + print OUTPUT5 $position_tmp." 
TESh1\n"; + my @nucs_count = split /\+/, $allele_percentage_back{$position_tmp}; + my ($nucs_alt2, $nucs_alt_array) = IUPAC_reverse($line[$position_tmp]); + + my @nucs_alt = @$nucs_alt_array; + my $nucs_alt; + my $no_variance = ""; + my $deduct_duplications_first = '0'; + my $deduct_low_quality_next_nuc_first = '0'; + my $deduct_mismatch_nuc_first = '0'; + + print OUTPUT5 $allele_percentage_back{$position_tmp}." TESh2\n"; + my $s = '0'; + my @nucs_order = ('A','C','T','G'); + my $AF_high = '0'; + my %allele_ordered; + + if (($nucs_count[0]+$nucs_count[1]+$nucs_count[2]+$nucs_count[3]) > 0) + { + my $h = '0.001'; + foreach (@nucs_alt) + { + my $nuc_tmp = $_-1; + my $AF_tmp = sprintf("%.3g",$nucs_count[$nuc_tmp]/($nucs_count[0]+$nucs_count[1]+$nucs_count[2]+$nucs_count[3])); + if ($nucs_order[$nuc_tmp] ne $ref_check[$pos_tmp-$prev_loc1] && @nucs_alt < 3) + { + $nuc_other_than_ref = $nucs_order[$nuc_tmp]; + } + print OUTPUT5 $nucs_order[$nuc_tmp]." TESh3\n"; + + if (exists($allele_ordered{$AF_tmp})) + { + $allele_ordered{$AF_tmp+0.001} = $nuc_tmp; + $h = '0.002'; + } + else + { + $allele_ordered{$AF_tmp} = $nuc_tmp; + } + print OUTPUT5 $AF_tmp." 
TESh4\n"; + } + my %deduct_duplications; + my $count_allele = '1'; + my $deduct_duplications_total; + foreach my $allele_ordered (sort {$b <=> $a} keys %allele_ordered) + { + if ($s eq '0') + { + $nucs_alt = $nucs_order[$allele_ordered{$allele_ordered}]; + + my %count_ext_first; + undef %count_ext_first; + +HP_NEXT_BACK: foreach my $ext_tmp (keys %extensionsb) + { + my $first_nuc = substr $extensionsb{$ext_tmp}, $position_tmp, 1; + my $match_ext; + my $id_match_b = $ext_tmp; + my $id_match_end = substr $id_match_b, -1, 1,""; + + if (exists($remove_extension_mismatch{$extensionsb{$ext_tmp}})) + { + if ($position_tmp >= $remove_extension_mismatch{$extensionsb{$ext_tmp}}) + { + $deduct_mismatch_nuc_first++; + next HP_NEXT_BACK; + } + } + + if (exists($extensions2b{$ext_tmp}) && $position_tmp > 0) + { + my $next_nuc = substr $extensionsb{$ext_tmp}, $position_tmp-1, 1; + if ($next_nuc eq "N" || $next_nuc eq "") + { + $deduct_low_quality_next_nuc_first++; + next HP_NEXT_BACK; + } + } + elsif (exists($extensions1b{$ext_tmp})) + { + my $next_nuc = substr $extensionsb{$ext_tmp}, $position_tmp+1, 1; + if ($next_nuc eq "N" || $next_nuc eq "") + { + $deduct_low_quality_next_nuc_first++; + next HP_NEXT_BACK; + } + } + + if (exists($hash{$id_match_b})) + { + my @id_match_b = split /,/, $hash{$id_match_b}; + + if ($id_match_end eq "1") + { + $match_ext = $id_match_b[0]; + } + elsif ($id_match_end eq "2") + { + $match_ext = $id_match_b[1]; + } + if ($use_quality ne "") + { + $match_ext =~ tr/1|2|3|4/N/; + } + if (exists($count_ext_first{length($extensionsb{$ext_tmp})})) + { + my $count = $count_ext_first{length($extensionsb{$ext_tmp})}+1; + $count_ext_first{length($extensionsb{$ext_tmp})} = $count; + } + else + { + $count_ext_first{length($extensionsb{$ext_tmp})} = '0'; + } + } + } + + foreach my $count_ext_first (keys %count_ext_first) + { + if ($count_ext_first{$count_ext_first} > ($nucs_count[$allele_ordered{$allele_ordered}]/($read_length-$right-$overlap))*3) + { + 
$deduct_duplications_first += $count_ext_first{$count_ext_first}-($nucs_count[$allele_ordered{$allele_ordered}]/($read_length-$right-$overlap)); + } + } + $deduct_duplications{0} = $nucs_count[$allele_ordered{$allele_ordered}]-$deduct_duplications_first-$deduct_low_quality_next_nuc_first-$deduct_mismatch_nuc_first; + $deduct_duplications_total = $deduct_duplications_first+$deduct_low_quality_next_nuc_first+$deduct_mismatch_nuc_first; + } + else + { + $nucs_alt .= ",".$nucs_order[$allele_ordered{$allele_ordered}]; + print OUTPUT5 $allele_ordered." ALLELE\n"; + my %count_ext; + undef %count_ext; + my $deduct_low_quality_next_nuc = '0'; + my $deduct_mismatch_nuc = '0'; + +HP_NEXT_BACK2: foreach my $ext_tmp (keys %extensionsb) + { + my $first_nuc = substr $extensionsb{$ext_tmp}, $position_tmp, 1; + my $match_ext; + my $match_ext_pair; + if ($first_nuc eq $nucs_order[$allele_ordered{$allele_ordered}]) + { + my $id_match_b = $ext_tmp; + my $id_match_end = substr $id_match_b, -1, 1,""; + + if (exists($remove_extension_mismatch{$extensionsb{$ext_tmp}})) + { + if ($position_tmp >= $remove_extension_mismatch{$extensionsb{$ext_tmp}}) + { + $deduct_mismatch_nuc++; + next HP_NEXT_BACK2; + } + } + + if (exists($extensions2b{$ext_tmp}) && $position_tmp > 0) + { + my $next_nuc = substr $extensionsb{$ext_tmp}, $position_tmp-1, 1; + if ($next_nuc eq "N" || $next_nuc eq "") + { + $deduct_low_quality_next_nuc++; + next HP_NEXT_BACK2; + } + } + elsif (exists($extensions1b{$ext_tmp})) + { + my $next_nuc = substr $extensionsb{$ext_tmp}, $position_tmp+1, 1; + if ($next_nuc eq "N" || $next_nuc eq "") + { + $deduct_low_quality_next_nuc++; + next HP_NEXT_BACK2; + } + } + + if (exists($hash{$id_match_b})) + { + my @id_match_b = split /,/, $hash{$id_match_b}; + + if ($id_match_end eq "1") + { + $match_ext = $id_match_b[0]; + $match_ext_pair = $id_match_b[1]; + } + elsif ($id_match_end eq "2") + { + $match_ext = $id_match_b[1]; + $match_ext_pair = $id_match_b[0]; + } + if ($use_quality ne "") + 
{ + $match_ext =~ tr/1|2|3|4/N/; + } + + if (exists($count_ext{length($extensionsb{$ext_tmp})})) + { + my $count = $count_ext{length($extensionsb{$ext_tmp})}+1; + $count_ext{length($extensionsb{$ext_tmp})} = $count; + } + else + { + $count_ext{length($extensionsb{$ext_tmp})} = '0'; + } + } + } + } + my $deduct_duplications = '0'; + foreach my $count_ext (keys %count_ext) + { + if ($count_ext{$count_ext} > ($nucs_count[$allele_ordered{$allele_ordered}]/($read_length-$right-$overlap))*3) + { + $deduct_duplications += $count_ext{$count_ext}-$nucs_count[$allele_ordered{$allele_ordered}]/($read_length-$right-$overlap); + } + } + $deduct_duplications{$count_allele} = $nucs_count[$allele_ordered{$allele_ordered}]-$deduct_duplications-$deduct_low_quality_next_nuc-$deduct_mismatch_nuc; + $deduct_duplications_total += ($deduct_duplications+$deduct_low_quality_next_nuc+$deduct_mismatch_nuc); + + $count_allele++; + } + $s++; + } + my $new_total_nuc = $nucs_count[0]+$nucs_count[1]+$nucs_count[2]+$nucs_count[3]-$deduct_duplications_total; + $no_variance = "yes"; + foreach my $deduct_duplications_tmp (sort {$a <=> $b} keys %deduct_duplications) + { + my $AF_tmp = sprintf("%.3g",($deduct_duplications{$deduct_duplications_tmp})/$new_total_nuc); + print OUTPUT5 $AF_tmp." 
NO_VARIANCE\n"; + if ($deduct_duplications_tmp eq '0') + { + $AF = $AF_tmp; + } + elsif ($AF_tmp >= $heteroplasmy) + { + $AF .= ",".$AF_tmp; + $no_variance = ""; + } + } + } + + $DP = $allele_total_back{$position_tmp}; + + my $nuc_in_ext_hp = substr $best_extension, $position_tmp, 1; + + if ($hp_seed_assemble eq "" && $no_variance ne "yes") + { + if (exists($variance_all{$pos_tmp}) && $hp_back ne "yes") + { + my $one; + my $two; + if ($variance_all{$pos_tmp} =~ m/^\S*\s\S*\s\S*\t(\S*)\s(\S*)\s.*/) + { + $one = substr $1, 1; + $two = substr $2, 1; + } + $variance_all_SNP{$pos_tmp} = $nucs_alt; + $variance_all{$pos_tmp."b"} = $chromosome."\t".$pos_tmp."\t".".\t".$ref_check[$position_tmp]."\t".$nucs_alt."\t.\t.\tAF=".$AF.";DP=".$DP; + $variance_all{$pos_tmp} = $chromosome."\t".$pos_tmp."\t".".\t".$ref_check[$position_tmp].$one."\t".$nucs_alt.$two."\t.\t.\tAF=".$AF.";DP=".$DP; + } + elsif (exists($variance_all{$pos_tmp}) && $hp_back eq "yes") + { + } + else + { + $variance_all_SNP{$pos_tmp} = $nucs_alt; + $variance_all{$pos_tmp} = $chromosome."\t".$pos_tmp."\t".".\t".$ref_check[$position_tmp]."\t".$nucs_alt."\t.\t.\tAF=".$AF.";DP=".$DP; + } + } + elsif ($ref_check[$position_tmp] ne $nuc_in_ext_hp && ($nuc_in_ext_hp eq "A" || $nuc_in_ext_hp eq "C" || $nuc_in_ext_hp eq "T" || $nuc_in_ext_hp eq "G")) + { + if (exists($linked_half_SNPs_exclude{$pos_tmp})) + { + } + else + { + my $pos_SNPs = -($position_back+$position_tmp+1); + $SNPs{$pos_SNPs} = $nuc_in_ext_hp; + } + $linked_SNPs{$pos_tmp} = undef + } + elsif ($ref_check[$position_tmp] ne $nuc_in_ext_hp && $no_variance ne "yes") + { + my $test_c = '0'; + foreach my $l_h_SNPs (keys %linked_half_SNPs) + { + $test_c++; + } + if ($test_c > 0) + { + $noback{$id} = "stop_HP"; + print OUTPUT5 "STOP_HALF_LINKED_BACK\n"; + } + $linked_half_SNPs{$pos_tmp} = undef; + if ($PCR_free ne "yes" && $test_c eq '0') + { + my $pos_SNPs = -($position_back+$position_tmp+1); + if ($nuc_other_than_ref ne "") + { + $SNPs{$pos_SNPs} = 
$nuc_other_than_ref; + } + substr $best_extension, $position_tmp, 1, $nuc_other_than_ref; + } + } + print OUTPUT5 $pos_tmp." POSI\n"; + } + + $next = '0'; + $var_SNP_detect = "yes"; + undef @pos; + } + elsif ($hp_seed_assemble ne "") + { + my $check = ""; + foreach my $pos_tmp (keys %variance_all_SNP) + { + if ($pos_tmp eq $prev_loc1-$d) + { + $not_linked_SNPs{$pos_tmp} = undef; + print OUTPUT5 $pos_tmp." NOT_LINKED\n"; + } + } + } + } + elsif ($ref_check[$th] eq ".") + { + } + elsif ($line[$d] eq ".") + { + } + elsif ($next < $max_SNP || $deletion_found eq "yes") + { + if ($y > $startprint2) + { + print OUTPUT5 $ref_check[$th]." VAR_DETECT\n"; + } + + $pos = $prev_loc1+$th; + $next++; + my $count_pos_tmp = @pos; + push @pos, $prev_loc1-$th; + if ($pos_first eq "") + { + $pos_first = $d; + } + if ($d > length($best_extension_tmp)-6 && length($best_extension_tmp) > 6 && $count_pos_tmp eq '0') + { + $best_extension = substr $best_extension, 0, -5; + last VAR_CHECK_BACK; + } + if ($d eq '0') + { + $max_SNP = length($best_extension)/4; + if ($max_SNP < 3) + { + $max_SNP = 3; + } + if (length($best_extension) > 15 && $max_SNP < 6) + { + $max_SNP = 6; + } + if (length($best_extension) > 25 && $max_SNP < 7) + { + $max_SNP = 7; + } + } + } + elsif ($next > 0 && $pos_first ne '0') + { + $best_extension = substr $best_extension, 0, $pos_first; + $save_seq_ref = "no"; + if ($y > $startprint2) + { + } + last VAR_CHECK_BACK; + } + elsif ($next >= $max_SNP && $pos_first eq '0' && $deletion_found ne "yes") + { + my @check_deletion = check_deletion($best_extension_tmp, $ref_check_tmp,"","","yes",""); + my $var_deletion = $check_deletion[0]; + my $one_or_two = $check_deletion[1]; + my $shorter = $check_deletion[2]; + $save_seq_ref = "no"; + print OUTPUT5 $var_deletion." 
VAR_DEL_DETECT_BACK\n"; + + if ($var_deletion =~ m/(.*)\*(.*)?/) + { + my $deletion = $1; + my $after_deletion = $2; + $deletion =~ tr/\*//d; + my $last_nuc = substr $after_deletion, 0, 1; + my $loc_in_ref = $prev_loc1-$gh; + my $deletion_length0; + + if (exists($large_variance_back{$id})) + { + my $deletion_length = $large_variance_back{$id}-$prev_loc1; + my $deletion_length_tmp = length($deletion); + my $deleted_part = substr $read_short_start2, 0, $deletion_length; + $loc_in_ref = $large_variance_back{$id}; + + if (length($best_extension) < 13) + { + $deletion_length_tmp = $overlap - length($deletion); + } + if ($one_or_two eq "one") + { + $deletion_length += $deletion_length_tmp; + $deleted_part = reverse($deletion).$deleted_part; + $loc_in_ref -= 1; + } + else + { + $deletion_length -= $deletion_length_tmp; + substr $deleted_part, 0, $deletion_length_tmp, ""; + } + + if (length($best_extension) < 13) + { + if ($overlap > length($deletion)) + { + $deleted_part = substr $read_short_start2, $deletion_length, $deletion_length_tmp; + } + } + print OUTPUT5 $deletion_length." DELETION_LENGTH\n"; + print OUTPUT5 $deletion_length_tmp." DELETION_LENGTH2\n"; + print OUTPUT5 $large_variance_length_back{$id}." DELETION_LENGTH2b\n"; + + $deletion = reverse($deleted_part); + $last_nuc = substr $hashref2{$large_variance_back{$id}-1}, 0, 1; + $deletion_length0 = $deletion_length; + + if ($large_variance_length_back{$id} > $deletion_length_tmp) + { + if ($one_or_two eq "two") + { + } + $one_or_two = "one"; + } + elsif ($large_variance_length_back{$id} < $deletion_length_tmp) + { + $one_or_two = "two"; + } + print OUTPUT5 $deleted_part." 
LARGE_DELETION\n"; + } + + my $tmp = '0'; + my $check_rep = substr $hashref2{$loc_in_ref}.$hashref2{$loc_in_ref+30}, -$tmp, length($deletion); + + while ($check_rep eq $deletion && $deletion ne "" && $check_rep ne "") + { + $tmp++; + $check_rep = substr $hashref2{$loc_in_ref}.$hashref2{$loc_in_ref+30}, -$tmp, length($deletion); + } + if ($tmp ne '0') + { + $tmp += length($deletion); + } + + my $pos_tmp = $loc_in_ref-1; + my $last_nucb = $last_nuc; + my $tmpie = $loc_in_ref-$tmp-1; + if ($hp_seed_assemble eq "") + { + if (exists($variance_all{$pos_tmp}) && $hp_back ne "yes") + { + if ($variance_all{$pos_tmp} =~ m/^\S*\s\S*\s\S*\s(\S*)\s(\S*)\s.*/) + { + $last_nuc = $1; + $last_nucb = $2; + } + $variance_all{$pos_tmp."b"} = $variance_all{$pos_tmp}; + delete $variance_all{$pos_tmp}; + } + elsif (exists($variance_all{$pos_tmp}) && $hp_back eq "yes") + { + } + else + { + if ($one_or_two eq "one") + { + $variance_all{$pos_tmp} = $chromosome."\t".$pos_tmp."\t".".\t".$last_nuc."\t".$last_nucb.reverse($deletion)."\t.\t.\t."; + $variance_back{$id."+".$tmpie} = length($deletion); + } + elsif($one_or_two eq "two") + { + $variance_all{$pos_tmp} = $chromosome."\t".$pos_tmp."\t".".\t".$last_nuc.reverse($deletion)."\t".$last_nucb."\t.\t.\t."; + $variance_back{$id."+".$tmpie} = -length($deletion); + } + } + } + my $check_dot = $after_deletion =~ tr/\./\./; + if ($shorter > 0) + { + substr $best_extension, -$shorter, $shorter, ""; + } + + if ($deletion_length0 < $large_variance_length_back{$id} && exists($large_variance_back{$id})) + { + if ($one_or_two eq "one") + { + $best_extension_tmp = substr $read_short_start2, 0, $large_variance_length_back{$id}-$deletion_length0; + $best_extension_tmp = reverse($best_extension_tmp).$best_extension; + $prev_loc1 = $prev_loc1-($prev_loc1-$large_variance_back{$id}); + } + elsif($one_or_two eq "two") + { + $ref_check_tmp = substr $ref_check, length($deletion); + } + $deletion_found = "yes"; + delete $large_variance_back{$id}; + delete 
$large_variance_length_back{$id}; + goto VAR_START_BACK; + } + + delete $large_variance_back{$id}; + delete $large_variance_length_back{$id}; + + if ($check_dot > 0) + { + if ($one_or_two eq "one") + { + $best_extension_tmp = substr $best_extension, length($deletion); + } + elsif($one_or_two eq "two") + { + $ref_check_tmp = substr reverse($ref_check), length($deletion); + } + $deletion_found = "yes"; + goto VAR_START_BACK; + } + } + if ($var_deletion ne "") + { + last VAR_CHECK_BACK; + } + elsif ($max_SNP eq '1') + { + $max_SNP = length($best_extension)/5; + if ($max_SNP < 3) + { + $max_SNP = 3; + } + if (length($best_extension) > 15 && $max_SNP < 6) + { + $max_SNP = 6; + } + if (length($best_extension) > 25 && $max_SNP < 7) + { + $max_SNP = 7; + } + undef @pos; + goto VAR_START_BACK; + } + else + { + if (exists($large_variance_back{$id})) + {} + else + { + print OUTPUT5 $prev_loc1-$gh." LARGE_VARIANCE_BACK\n"; + + $large_variance_back{$id} = $prev_loc1; + } + last VAR_CHECK_BACK; + } + } + else + { + $gh++; + last VAR_CHECK_BACK; + } + $d++ + } + last VAR_CHECK_BACK; + } + + if ($best_extension ne "") + { + $indel_split = '0'; + delete $indel_split{$id}; + } + foreach my $var_pos_tmp (keys %variance_back) + { + my @split = split /\+/, $var_pos_tmp; + my $var_pos = $split[1]; + if ($split[0] eq $id) + { + if ($ref_id + $ref_loc - (length($last_seq_ref.$best_extension)) > $var_pos) + { + $save_seq_ref = "no"; + } + } + } + if ($save_seq_ref ne "no") + { + $last_ref_seq_back{$id} = $last_seq_ref; + $last_pos_seq_back{$id} = $prev_loc1_tmp; + } + else + { + delete $last_ref_seq_back{$id}; + } + $SNR_test = ""; + if ($deletion_back eq "yes" && $del_detect eq '1') + { + $del_detect = '2'; + goto DEL_DETECT_BACK; + } + if ($deletion_back eq "yes") + { + $best_extension = $best_extension_del; + } + $best_extension_back_prev{$id} = $best_extension; + goto AFTER_EXT_BACK; + } + else + { + delete $last_ref_seq_back{$id}; + } + + if (length($best_extension1_part) > 10) + { + 
my @ref1 = build_partialb_4dots $best_extension1_part; +EXT1_PART_BACK: foreach my $best_extension1_part2 (@ref1) + { + my $ref_check_tmp = $ref_check; + my $ref_check_tmp2 = $ref_check_star; + if ($ref_check_tmp =~ s/$best_extension1_part2/$best_extension1_part2/ || $ref_check_tmp2 =~ s/$best_extension1_part2/$best_extension1_part2/) + { + print OUTPUT5 "REFERENCE_GUIDED1\n"; + print OUTPUT5 $best_extension1_tmp." BEST_EXTENSION1\n\n"; + $best_extension = $best_extension1_tmp; + $reference_guided_back = "yes1"; + last EXT1_PART_BACK; + } + } + undef @ref1; + } + if ($reference_guided_back ne "yes1" && length($best_extension1_partb) > 10) + { + my @ref1b = build_partialb_4dots $best_extension1_partb; +EXT1_PARTB_BACK: foreach my $best_extension1_part2b (@ref1b) + { + my $ref_check_tmp = $ref_check; + my $ref_check_tmp2 = $ref_check_star; + if ($ref_check_tmp =~ s/$best_extension1_part2b/$best_extension1_part2b/ || $ref_check_tmp2 =~ s/$best_extension1_part2b/$best_extension1_part2b/) + { + print OUTPUT5 "REFERENCE_GUIDED1c\n"; + print OUTPUT5 $best_extension1_tmp." BEST_EXTENSION1\n\n"; + $best_extension = $best_extension1_tmp; + $reference_guided_back = "yes1"; + last EXT1_PARTB_BACK; + } + } + undef @ref1b; + } + if ($reference_guided_back ne "yes1") + { +EXT1_PART_single_BACK: foreach my $extensions_group1 (@extensions_group1) + { + if (length($extensions_group1) > 15) + { + my $extensions_group1_part = substr $extensions_group1, 0, 25; + my @ref1_single = build_partialb_4dots $extensions_group1_part; + foreach my $best_extension1_part2_single (@ref1_single) + { + my $ref_check_tmp = reverse($ref_check); + my $ref_check_tmp2 = reverse($ref_check_star); + if ($ref_check_tmp =~ s/$best_extension1_part2_single/$best_extension1_part2_single/ || $ref_check_tmp2 =~ s/$best_extension1_part2_single/$best_extension1_part2_single/) + { + print OUTPUT5 "REFERENCE_GUIDED1b\n"; + print OUTPUT5 $best_extension1_tmp." 
BEST_EXTENSION1_single\n\n"; + $best_extension = $best_extension1_tmp; + $reference_guided_back = "yes1"; + last EXT1_PART_single_BACK; + } + } + undef @ref1_single; + } + } + } + + if (length($best_extension2_part) > 10) + { + my @ref2 = build_partialb_4dots $best_extension2_part; +EXT2_PART_BACK: foreach my $best_extension2_part2 (@ref2) + { + my $ref_check_tmp = $ref_check; + my $ref_check_tmp2 = $ref_check_star; + if ($ref_check_tmp =~ s/$best_extension2_part2/$best_extension2_part2/ || $ref_check_tmp2 =~ s/$best_extension2_part2/$best_extension2_part2/) + { + $best_extension = $best_extension2_tmp; + if ($reference_guided_back eq "") + { + $reference_guided_back = "yes2"; + } + else + { + $reference_guided_back = "yes_both"; + } + print OUTPUT5 $reference_guided_back." REFERENCE_GUIDED2\n"; + print OUTPUT5 $best_extension2_tmp." BEST_EXTENSION2\n\n"; + last EXT2_PART_BACK; + } + } + undef @ref2; + } + if ($reference_guided_back ne "yes2" && $reference_guided_back ne "yes_both" && length($best_extension2_partb) > 10) + { + my @ref2b = build_partialb_4dots $best_extension2_partb; +EXT2_PARTB_BACK: foreach my $best_extension2_part2b (@ref2b) + { + my $ref_check_tmp = $ref_check; + my $ref_check_tmp2 = $ref_check_star; + if ($ref_check_tmp =~ s/$best_extension2_part2b/$best_extension2_part2b/ || $ref_check_tmp2 =~ s/$best_extension2_part2b/$best_extension2_part2b/) + { + $best_extension = $best_extension2_tmp; + if ($reference_guided_back eq "") + { + $reference_guided_back = "yes2"; + } + else + { + $reference_guided_back = "yes_both"; + } + print OUTPUT5 $reference_guided_back." REFERENCE_GUIDED\n"; + print OUTPUT5 $best_extension2_tmp." 
BEST_EXTENSION2\n\n"; + last EXT2_PARTB_BACK; + } + } + undef @ref2b; + } + if ($reference_guided_back ne "yes2" && $reference_guided_back ne "yes_both") + { +EXT2_PART_single_BACK: foreach my $extensions_group2 (@extensions_group2) + { + if (length($extensions_group2) > 15) + { + my $extensions_group2_part = substr $extensions_group2, 0, 25; + my @ref2_single = build_partialb_4dots $extensions_group2_part; + foreach my $best_extension2_part2_single (@ref2_single) + { + my $ref_check_tmp = reverse($ref_check); + my $ref_check_tmp2 = reverse($ref_check_star); + if ($ref_check_tmp =~ s/$best_extension2_part2_single/$best_extension2_part2_single/ || $ref_check_tmp2 =~ s/$best_extension2_part2_single/$best_extension2_part2_single/) + { + if ($reference_guided_back eq "") + { + $reference_guided_back = "yes2"; + } + else + { + $reference_guided_back = "yes_both"; + } + print OUTPUT5 "REFERENCE_GUIDED2b\n"; + print OUTPUT5 $best_extension2_tmp." BEST_EXTENSION2_single\n\n"; + $best_extension = $best_extension2_tmp; + last EXT2_PART_single_BACK; + } + } + undef @ref2_single; + } + } + } + + if ($count_split > 2 && $reference_guided_back ne "yes_both") + { + if (length($best_extension3_part) > 10) + { + my @ref3 = build_partialb_4dots $best_extension3_part; +EXT3_PART_BACK: foreach my $best_extension3_part2 (@ref3) + { + my $ref_check_tmp = $ref_check; + my $ref_check_tmp2 = $ref_check_star; + if ($ref_check_tmp =~ s/$best_extension3_part2/$best_extension3_part2/ || $ref_check_tmp2 =~ s/$best_extension3_part2/$best_extension3_part2/) + { + if ($reference_guided_back eq "") + { + $reference_guided_back = "yes3"; + } + else + { + $reference_guided_back = "yes_both"; + } + print OUTPUT5 "REFERENCE_GUIDED\n"; + print OUTPUT5 $best_extension3_tmp." 
BEST_EXTENSION3\n\n"; + $best_extension = $best_extension3_tmp; + last EXT3_PART_BACK; + } + } + undef @ref3; + } + if ($reference_guided_back ne "yes3" && $reference_guided_back ne "yes_both") + { +EXT3_PART_single_BACK: foreach my $extensions_group3 (@extensions_group3) + { + if (length($extensions_group3) > 15) + { + my $extensions_group3_part = substr $extensions_group3, 0, 25; + my @ref3_single = build_partialb_4dots $extensions_group3_part; + foreach my $best_extension3_part2_single (@ref3_single) + { + my $ref_check_tmp = reverse($ref_check); + my $ref_check_tmp2 = reverse($ref_check_star); + if ($ref_check_tmp =~ s/$best_extension3_part2_single/$best_extension3_part2_single/ || $ref_check_tmp2 =~ s/$best_extension3_part2_single/$best_extension3_part2_single/) + { + if ($reference_guided_back eq "") + { + $reference_guided_back = "yes3"; + } + else + { + $reference_guided_back = "yes_both"; + } + print OUTPUT5 "REFERENCE_GUIDED\n"; + print OUTPUT5 $best_extension3_tmp." BEST_EXTENSION3_single\n\n"; + $best_extension = $best_extension3_tmp; + last EXT3_PART_single_BACK; + } + } + undef @ref3_single; + } + } + } + if ($reference_guided_back ne "yes3" && $reference_guided_back ne "yes_both" && length($best_extension3_partb) > 10) + { + my @ref3b = build_partialb_4dots $best_extension3_partb; +EXT3_PARTB_BACK: foreach my $best_extension3_part2b (@ref3b) + { + my $ref_check_tmp = $ref_check; + my $ref_check_tmp2 = $ref_check_star; + if ($ref_check_tmp =~ s/$best_extension3_part2b/$best_extension3_part2b/ || $ref_check_tmp2 =~ s/$best_extension3_part2b/$best_extension3_part2b/) + { + if ($reference_guided_back eq "") + { + $reference_guided_back = "yes3"; + } + else + { + $reference_guided_back = "yes_both"; + } + print OUTPUT5 "REFERENCE_GUIDED\n"; + print OUTPUT5 $best_extension3_tmp." 
BEST_EXTENSION3\n\n"; + $best_extension = $best_extension3_tmp; + last EXT3_PARTB_BACK; + } + } + undef @ref3b; + } + } + if ($count_split > 3 && $reference_guided_back ne "yes_both") + { + if (length($best_extension4_part) > 10) + { + my @ref4 = build_partialb_4dots $best_extension4_part; +EXT4_PART_BACK: foreach my $best_extension4_part2 (@ref4) + { + my $ref_check_tmp = $ref_check; + my $ref_check_tmp2 = $ref_check_star; + if ($ref_check_tmp =~ s/$best_extension4_part2/$best_extension4_part2/ || $ref_check_tmp2 =~ s/$best_extension4_part2/$best_extension4_part2/) + { + if ($reference_guided_back eq "") + { + $reference_guided_back = "yes4"; + } + else + { + $reference_guided_back = "yes_both"; + } + print OUTPUT5 "REFERENCE_GUIDED\n"; + print OUTPUT5 $best_extension4_tmp." BEST_EXTENSION4\n\n"; + $best_extension = $best_extension4_tmp; + last EXT4_PART_BACK; + } + } + undef @ref4; + } + if ($reference_guided_back ne "yes4" && $reference_guided_back ne "yes_both") + { +EXT4_PART_single_BACK: foreach my $extensions_group4 (@extensions_group4) + { + if (length($extensions_group4) > 15) + { + my $extensions_group4_part = substr $extensions_group4, 0, 25; + my @ref4_single = build_partialb_4dots $extensions_group4_part; + foreach my $best_extension4_part2_single (@ref4_single) + { + my $ref_check_tmp = reverse($ref_check); + my $ref_check_tmp2 = reverse($ref_check_star); + if ($ref_check_tmp =~ s/$best_extension4_part2_single/$best_extension4_part2_single/ || $ref_check_tmp2 =~ s/$best_extension4_part2_single/$best_extension4_part2_single/) + { + if ($reference_guided_back eq "") + { + $reference_guided_back = "yes4"; + } + else + { + $reference_guided_back = "yes_both"; + } + print OUTPUT5 "REFERENCE_GUIDED\n"; + print OUTPUT5 $best_extension4_tmp." 
BEST_EXTENSION4_single\n\n"; + $best_extension = $best_extension4_tmp; + last EXT4_PART_single_BACK; + } + } + undef @ref4_single; + } + } + } + if ($reference_guided_back ne "yes4" && $reference_guided_back ne "yes_both" && length($best_extension4_partb) > 10) + { + my @ref4b = build_partialb_4dots $best_extension4_partb; +EXT4_PARTB_BACK: foreach my $best_extension4_part2b (@ref4b) + { + my $ref_check_tmp = $ref_check; + my $ref_check_tmp2 = $ref_check_star; + if ($ref_check_tmp =~ s/$best_extension4_part2b/$best_extension4_part2b/ || $ref_check_tmp2 =~ s/$best_extension4_part2b/$best_extension4_part2b/) + { + if ($reference_guided_back eq "") + { + $reference_guided_back = "yes4"; + } + else + { + $reference_guided_back = "yes_both"; + } + print OUTPUT5 "REFERENCE_GUIDED\n"; + print OUTPUT5 $best_extension4_tmp." BEST_EXTENSION4\n\n"; + $best_extension = $best_extension4_tmp; + last EXT4_PARTB_BACK; + } + } + undef @ref4b; + } + } + if ($reference_guided_back eq "yes_both") + { + $best_extension = ""; + $reference_guided_back = ""; + goto INDEL_BACKa + } + elsif ($reference_guided_back ne "") + { + $reference_guided_back = "yes"; + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + goto AFTER_EXT_BACK; + } + else + { + $best_extension = ""; + } + } + } + last CHECK_REF_BACK; + } + elsif (length($read) > 1100 && $p > $p_prev+15) + { +CHECK_BACK_REF0_BACK: $p_prev = $p; + my $length_forward = '1000'; + if ($further eq "yes") + { + $length_forward = '5000'; + } + elsif ($further eq "yes2") + { + $length_forward = '10000'; + } + elsif ($further eq "yes3") + { + $length_forward = '20000'; + } + my $read_part_forward = substr $read, $length_forward+$p-150, 200; + my @ref_id3_new; + undef @ref_id3_new; + my $ref_part_tmp2; +CHECK_BACK_REF_BACK: foreach my $ref_id (@ref_id3) + { + $ref_part_tmp2 = $ref_id3{$ref_id}; + if (exists($hashref2{$ref_id-$length_forward})) + { + my $ref_part_forward = 
$hashref2{$ref_id-$length_forward}; + if ($y > $startprint2) + { + print OUTPUT5 $ref_part_forward." REF_PART_FORWARD\n"; + } + my @ref_part_forward = build_partialb_4dots $ref_part_forward; + foreach my $ref_part_forward_tmp (@ref_part_forward) + { + my $found = $read_part_forward =~ s/$ref_part_forward_tmp/$ref_part_forward_tmp/; + if ($found > 0) + { + push @ref_id3_new, $ref_id; + next CHECK_BACK_REF_BACK; + } + } + undef @ref_part_forward; + } + } + if (@ref_id3_new eq 1) + { + undef @ref_id3; + @ref_id3 = @ref_id3_new; + $found_further_back = "yes"; + $ref_part_prev = $ref_part_tmp2; + } + elsif ($further eq "" && length($read) > 5200+$p) + { + $further = "yes"; + goto CHECK_BACK_REF0_BACK; + } + elsif ($further eq "yes" && length($read) > 10200+$p) + { + $further = "yes2"; + goto CHECK_BACK_REF0_BACK; + } + elsif ($further eq "yes2" && length($read) > 20200+$p) + { + $further = "yes3"; + goto CHECK_BACK_REF0_BACK; + } + else + { + undef @ref_id3; + } + } + $p++; + } + if (($variance_detection eq "yes" || $heteroplasmy ne "") && $split eq "" && $repetitive_detect_back eq "") + { + if ($best_extension ne "") + { + $indel_split_back = '0'; + delete $indel_split_back{$id}; + } + $SNR_test = ""; + if ($y > $startprint2) + { + print OUTPUT5 reverse($best_extension)." 
BEST_EXTENSION_BACK\n\n"; + } + $best_extension_back_prev{$id} = $best_extension; + goto AFTER_EXT_BACK; + } + } +INDEL_BACKa: + if ($last_chance_back ne "yes" && (($SNP_active_back eq "" && $before_back eq "yes") || $extensions_before eq "yes") && $indel_split_skip_back ne "yes" && $ext_total_back > 15 && (($delete_first ne "yes_back" || $count_split ne "2") || (length($best_extension1) < 5 && length($best_extension2) < 5 && length($best_extension3) < 5 && length($best_extension4) < 5)) && $indel_split_back < ($read_length-25 -$overlap)) + { + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE SPLIT_BACK\n\n"; + } + } + elsif ((($before_back eq "yes" && $SNP_active_back eq "") || $extensions_before eq "yes") && (($delete_first ne "yes_back" || $count_split > 2) || (length($best_extension1) < 5 && length($best_extension2) < 5 && length($best_extension3) < 5 && length($best_extension4) < 5))) + { + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + $SNP_active_back{$id_original} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "\nDELETE SPLIT_BACK2\n\n"; + } + } + elsif (($before_back eq "yes" || $extensions_before eq "yes") && (($delete_first ne "yes_back" || $count_split ne "2") || (length($best_extension1) < 5 && length($best_extension2) < 5 && length($best_extension3) < 5 && length($best_extension4) < 5))) + { + if ($y > $startprint2) + { + print OUTPUT5 $delete_first." 
DELETE_FIRST\n"; + print OUTPUT5 "STOP BACK\n"; + } + $read_new = $read_new1; + $best_extension = ""; + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + $seed{$id} = $read_new; + + delete $last_chance_back{$id}; + $noback = "stop"; + $noback{$id} = "stop"; + if ($noforward ne "stop") + { + goto SEED; + } + else + { + delete $seed{$id}; + $id = $id_original; + $split = ""; + goto AFTER_EXT_BACK; + } + } +INDEL_BACK: + $id = $id_original; + $split = ""; + $split_forward = "yes"; + + + if ($jump_rep_back eq "yes" || ($extensions_before ne "yes" && $indel_split_skip_back ne "yes" && ($delete_first eq "" || $delete_second eq "" || $delete_third eq "") && $before_back ne "yes" && $reference_guided_back ne "yes")) + { + my $before_shorter = ""; + my $overhangb = 1+($read_length/40); + my $count_one = '0'; + my $count_two = '0'; + while (($count_one < 3 || $count_two < 3) && $overhangb < 25) + { + $overhangb++; + $count_one = '0'; + $count_two = '0'; + foreach my $one (@extensions_group1) + { + if (length($one) < $overhangb && length($one) > 0) + { + $count_one++; + if (@extensions_group1 < 2) + { + $count_one++; + } + } + } + foreach my $two (@extensions_group2) + { + if (length($two) < $overhangb && length($two) > 0) + { + $count_two++; + if (@extensions_group2 < 2) + { + $count_two++; + } + } + } + } + my $overhang = sprintf("%.0f", $overhangb); + my $overhang_for_pairs = $overhang; + if ($overhang_for_pairs > 35 && $average_coverage_ext > 20) + { + $overhang = 35; + } + if ($overhang_for_pairs > 20 && $average_coverage_ext > 65) + { + $overhang = 20; + } + delete $jump_rep_back{$id}; +BEFORE_BACK: + my $s = '0'; + + if ($before_shorter eq "yes") + { + $overhang += 5; + } + $before_shorter = ""; + + print OUTPUT5 $overhang." 
OVERHANG_BACK\n"; + my @extensions_yuyu; + undef @extensions_yuyu; + my %extensions_yuyu; + my %before1F; + my %before2F; + my %before3F; + my %before4F; + my %before1B; + my %before2B; + my %before3B; + my %before4B; + undef %repetitive_pair; + + my $skip_overhang = ""; + if ($AT_rich_before_back eq "yes") + { + my $before_split_tmp = substr $read_short_start2, 0, $read_length-$overhang; + my $A_rich_test = $before_split_tmp =~ tr/AX\.//; + my $T_rich_test = $before_split_tmp =~ tr/TX\.//; + my $G_rich_test = $before_split_tmp =~ tr/GX\.//; + my $C_rich_test = $before_split_tmp =~ tr/CX\.//; + if ($A_rich_test > ($read_length-$overhang-3) || $T_rich_test > ($read_length-$overhang-3) || $G_rich_test > ($read_length-$overhang-3) || $C_rich_test > ($read_length-$overhang-3)) + { + $skip_overhang = "yes"; + $overhang -= 5; + } + } + $s = '0'; + my $before_split = substr $read_short_start2, $read_length-$left-1-$overhang-$overlap, $overhang+$overlap; + my $star0 = '0'; + my $star0b = '0'; + my $star0b2 = '0'; + if ($containX_short_start2 > 0) + { + my $before_split2 = substr $read_short_start2, 0, $read_length-$left-1-$overlap-$overhang; + my $before_split3 = substr $read_short_start2, $read_length-$left-1-$overlap-$overhang, $overhang+$overlap; + $star0 = $before_split2 =~ tr/\*//; + $star0b = $before_split3 =~ tr/\*//; + if ($star0 > 0 || $star0b > 0) + { + $before_split = substr $read_short_start2, $read_length-$left-1-$overhang-$overlap+($star0*2), $overhang+$overlap+($star0)+($star0b*2); + my $star0c = $before_split =~ tr/\*//; + while ($star0c > $star0b) + { + $before_split = substr $read_short_start2, $read_length-$left-1-$overhang-$overlap+($star0*2), $overhang+$overlap+($star0)+($star0b*2); + $star0b = $star0c; + $star0c = $before_split =~ tr/\*//; + } + } + } + + print OUTPUT5 $before_split." 
BEFORE_SPLIT\n"; + while ($s <= length($before_split)-$overlap-($star0b*2)) + { + my $line_tmp_reverse = substr $before_split, $s, $overlap; + + if ($star0 > 0 || $star0b > 0) + { + my $star = $line_tmp_reverse =~ tr/\*//; + $line_tmp_reverse = substr $before_split, $s, $overlap+($star*2); + my $star2 = $line_tmp_reverse =~ tr/\*//; + while ($star2 > $star) + { + $line_tmp_reverse = substr $before_split, $s, $overlap+($star*2)+(($star2-$star)*2); + $star = $star2; + $star2 = $line_tmp_reverse =~ tr/\*//; + } + my $before_split2_tmp = substr $read_short_start2, 0, $read_length-$left-1-$overhang-$overlap+($star0*2)+(length($before_split)-$s-$overlap); + $star0b2 = $before_split2_tmp =~ tr/\*//; + } + + my %line_tmp = build_partial3b $line_tmp_reverse, "reverse_back"; + foreach my $line_reverse (keys %line_tmp) + { + $line_reverse =~ tr/ACTG/TGAC/; + my $line = reverse($line_reverse); + if (exists($hash2b{$line})) + { + my $search = $hash2b{$line}; + $search = substr $search, 1; + my @search = split /,/,$search; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + } + if ($use_quality ne "") + { + $found =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + my $found_new; + my $first_10 = substr $found, $read_length-1-$overhang+$s-10+$star0, 10; + my $first_10b = substr $found, $read_length-1-$overhang+$s-10, 10; + my $first_nuc; + $first_10 =~ tr/ACTG/TGAC/; + $first_10b =~ tr/ACTG/TGAC/; + my $first_10r = reverse($first_10); + my $first_10rb = reverse($first_10b); + my $first_10_read_start = substr $read_start, 0, 10; + my $check_first10 = $first_10r =~ s/$first_10_read_start/$first_10_read_start/; + my $check_first10b = $first_10rb =~ 
s/$first_10_read_start/$first_10_read_start/;

# ---------------------------------------------------------------------------
# Reverse-strand hit: validate the candidate read, split it, and classify it.
#
# $check_first10 / $check_first10b are the match counts of the two candidate
# 10-mers (taken with and without the $star0 offset) against the first 10
# characters of $read_start; the 10-mer is interpolated as a regex pattern.
# A read is used only when at least one of the two anchors matched.
# ---------------------------------------------------------------------------

# Four-argument substr with "" as replacement CUTS the prefix out of $found:
# $found_new receives the removed prefix, $found keeps the remainder, and
# $first_nuc is the first character of that remainder.
if ($check_first10 > 0)
{
    $found_new = substr $found, 0, $read_length-1-$overhang+$s+$star0,"";
    $first_nuc = substr $found, 0, 1;
}
elsif ($check_first10b > 0)
{
    $found_new = substr $found, 0, $read_length-1-$overhang+$s,"";
    $first_nuc = substr $found, 0, 1;
}

if ($check_first10 > 0 || $check_first10b > 0)
{
    # Complement all three pieces so they can be compared with the
    # best_extension candidates.
    $first_nuc =~ tr/ACTG/TGAC/;
    $found_new =~ tr/ACTG/TGAC/;
    $found     =~ tr/ACTG/TGAC/;

    my $first_nuc1 = substr $best_extension1, 0, 1;
    my $first_nuc2 = substr $best_extension2, 0, 1;
    my $first_nuc3 = substr $best_extension3, 0, 1;
    my $first_nuc4 = substr $best_extension4, 0, 1;

    # Reverse of the complemented prefix = reverse complement of the part of
    # the read that was cut off above.
    my $found_reverse = reverse($found_new);

    # The complemented remainder is stored per read key as its extension.
    $extensions_yuyu{$search} = $found;

    # Bucket the read by which candidate extension its first base agrees with.
    # An empty $first_nuc never matches, so an empty best_extensionN cannot
    # attract reads.
    if ($first_nuc ne "")
    {
        if ($first_nuc eq $first_nuc1)
        {
            $before1B{$search} = $found_reverse;
        }
        elsif ($first_nuc eq $first_nuc2)
        {
            $before2B{$search} = $found_reverse;
        }
        elsif ($first_nuc eq $first_nuc3)
        {
            $before3B{$search} = $found_reverse;
        }
        # FIX: this branch used to test $first_nuc3 again, which made it
        # unreachable and left %before4B unfilled on this path. The matching
        # forward-strand chain further down compares against $first_nuc4.
        elsif ($first_nuc eq $first_nuc4)
        {
            $before4B{$search} = $found_reverse;
        }
    }

    # Optionally dump every read pair that was used, once per read id.
    # %save_reads is used as a "seen" set (the values are undef).
    if ($save_reads ne "")
    {
        my $add_read = $search_tmp;
        if (!exists($save_reads{$add_read}))
        {
            $save_reads{$add_read} = undef;
            if ($save_reads eq "2")
            {
                # Mode "2": print the id looked up in %map_ids, without a
                # leading ">".
                my $add_read2 = $map_ids{$add_read};
                print OUTPUT10 $add_read2."\/1\n";
                print OUTPUT11 $add_read2."\/2\n";
            }
            else
            {
                print OUTPUT10 ">".$add_read."\/1\n";
                print OUTPUT11 ">".$add_read."\/2\n";
            }
            if (exists($hash{$add_read}))
            {
                # %hash stores both mates as "forward,reverse"; the statements
                # that print them, and the closing braces of the blocks opened
                # here, follow on the next line of the file.
                my @add_read = split /,/,$hash{$add_read};
my $forward = $add_read[0]; + my $reverse = $add_read[1]; + if ($use_quality ne "") + { + $forward =~tr/1234/ACTG/; + $reverse =~tr/1234/ACTG/; + } + print OUTPUT10 $forward."\n"; + print OUTPUT11 $reverse."\n"; + } + } + } + } + } + } + } + } + $s++; + } + $s = '0'; + $before_split = substr $read_short_start2, $read_length-$right-1-$overhang-$overlap, $overhang+$overlap; + my $star3 = '0'; + if ($containX_short_start2 > 0) + { + my $before_split2 = substr $read_short_start2, 0, $read_length-$right-1; + $star3 = $before_split2 =~ tr/\*//; + if ($star0 > 0) + { + $before_split = substr $read_short_start2, $read_length-$right-1-$overhang-$overlap+($star3*2), $overhang+$overlap+($star3*2); + } + } + + while ($s <= length($before_split)-$overlap) + { + my $line_tmp = substr $before_split, $s, $overlap; + if ($star3 > 0) + { + my $star = $line_tmp =~ tr/\*//; + $line_tmp = substr $before_split, $s, $overlap+($star*2); + my $star2 = $line_tmp =~ tr/\*//; + while ($star2 > $star) + { + $line_tmp = substr $before_split, $s, $overlap+($star*2)+(($star2-$star)*2); + $star = $star2; + $star2 = $line_tmp =~ tr/\*//; + } + } + my %line_tmp = build_partial3b $line_tmp, "back"; + foreach my $line (keys %line_tmp) + { + if (exists($hash2c{$line})) + { + my $search = $hash2c{$line}; + $search = substr $search, 1; + my @search = split /,/,$search; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + } + if ($use_quality ne "") + { + $found =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + my $found_new; + my $first_nuc; + my $first_10 = substr $found, $overhang-$s+1, 10; + my $first_10b = substr $found, $overhang-$s+1-$star3, 10; + + my $first_10_read_start = substr 
$read_start, 0, 10; + my $check_first10 = $first_10 =~ s/$first_10_read_start/$first_10_read_start/; + my $check_first10b = $first_10b =~ s/$first_10_read_start/$first_10_read_start/; + if ($check_first10 > 0) + { + $found_new = substr $found, $overhang-$s+1, 100000, ""; + $first_nuc = substr $found, -1, 1; + } + elsif ($check_first10b > 0) + { + $found_new = substr $found, $overhang-$s+1-$star3, 100000, ""; + $first_nuc = substr $found, -1, 1; + } + if ($check_first10 > 0 || $check_first10b > 0) + { + my $first_nuc1 = substr $best_extension1, 0, 1; + my $first_nuc2 = substr $best_extension2, 0, 1; + my $first_nuc3 = substr $best_extension3, 0, 1; + my $first_nuc4 = substr $best_extension4, 0, 1; + my $extension_yuyu = reverse($found); + $extensions_yuyu{$search} = $extension_yuyu; + + if ($first_nuc eq $first_nuc1 && ($check_first10 > 0 || $check_first10b > 0) && $first_nuc ne "") + { + if ($y > $startprint2) + { + } + $filter_before1_pair{$search} = $found_new; + $before1F{$search} = $found_new; + } + elsif ($first_nuc eq $first_nuc2 && ($check_first10 > 0 || $check_first10b > 0) && $first_nuc ne "") + { + if ($y > $startprint2) + { + } + $filter_before2_pair{$search} = $found_new; + $before2F{$search} = $found_new; + } + elsif ($first_nuc eq $first_nuc3 && ($check_first10 > 0 || $check_first10b > 0) && $first_nuc ne "") + { + if ($y > $startprint2) + { + } + $filter_before3_pair{$search} = $found_new; + $before3F{$search} = $found_new; + } + elsif ($first_nuc eq $first_nuc4 && ($check_first10 > 0 || $check_first10b > 0) && $first_nuc ne "") + { + if ($y > $startprint2) + { + } + $filter_before4_pair{$search} = $found_new; + $before4F{$search} = $found_new; + } + if ($save_reads ne "") + { + my $add_read = $search_tmp; + if (exists($save_reads{$add_read})) + { + } + else + { + $save_reads{$add_read} = undef; + if ($save_reads eq "2") + { + my $add_read2 = $map_ids{$add_read}; + print OUTPUT10 $add_read2."\/1\n"; + print OUTPUT11 $add_read2."\/2\n"; + } + else + { 
+ print OUTPUT10 ">".$add_read."\/1\n"; + print OUTPUT11 ">".$add_read."\/2\n"; + } + if (exists($hash{$add_read})) + { + my @add_read = split /,/,$hash{$add_read}; + my $forward = $add_read[0]; + my $reverse = $add_read[1]; + if ($use_quality ne "") + { + $forward =~tr/1234/ACTG/; + $reverse =~tr/1234/ACTG/; + } + print OUTPUT10 $forward."\n"; + print OUTPUT11 $reverse."\n"; + } + } + } + } + } + } + } + } + $s++; + } + my %before_all1 = (%before1B, %before1F); + my %before_all2 = (%before2B, %before2F); + my %before_all3 = (%before3B, %before3F); + my %before_all4 = (%before4B, %before4F); + + + my $start_short_tmp = substr $read_short_start2, 0, $read_length+20; + my $star_first = substr $start_short_tmp, 0, 1; + if ($star_first eq "*") + { + $start_short_tmp = substr $read_short_start2, 1, $read_length+20; + } +BEFORE_EXTRA_BACK: + if ($y > $startprint2) + { + print OUTPUT5 $start_short_tmp." START_SHORT_TMP\n\n"; + } + my $first_yuyu = ""; + my $second_yuyu = ""; + my $third_yuyu = ""; + my $fourth_yuyu = ""; + my $first_yuyu2 = '0'; + my $second_yuyu2 = '0'; + my $third_yuyu2 = '0'; + my $fourth_yuyu2 = '0'; + my @filter_dot_before1; + my @filter_dot_before2; + my @filter_dot_before3; + my @filter_dot_before4; + undef @filter_dot_before1; + undef @filter_dot_before2; + undef @filter_dot_before3; + undef @filter_dot_before4; + my @extensions_before; + my @extensions_before1; + my @extensions_before2; + my @extensions_before3; + my @extensions_before4; + undef @extensions_before; + undef @extensions_before1; + undef @extensions_before2; + undef @extensions_before3; + undef @extensions_before4; + + my $start_short_tmp_part = substr $start_short_tmp, 0, $read_length-$overhang-5-1; + + foreach my $search (keys %before_all2) + { + my $overhang_tmp = $overhang; + my $yuyu0 = $before_all2{$search}; + my $yuyu2 = $yuyu0; + if (length($yuyu2) >= $read_length-$overhang_tmp-5-1) + { + $yuyu2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $yuyu2_tmp = $yuyu2; + my $check_yuyuy2 = 
$yuyu2_tmp =~ s/$start_short_tmp_part/$start_short_tmp_part/; + + if ($check_yuyuy2 > 0) + { + $second_yuyu = "yes"; + $second_yuyu2++; + my $start_short_tmp_part = substr $start_short_tmp, 0, length($yuyu2); + my $start_short_tmp_part2; + my $star = $start_short_tmp_part =~ tr/\*//; + my $check_yuyuy2b2 = '0'; + if ($star > 0) + { + $start_short_tmp_part = substr $start_short_tmp, 0, length($yuyu2)+($star*2); + $start_short_tmp_part2 = substr $start_short_tmp, 0, length($yuyu2)+($star); + my $yuyu2_tmp2 = $yuyu2; + $check_yuyuy2b2 = $yuyu2_tmp2 =~ s/$start_short_tmp_part2/$start_short_tmp_part2/; + } + my $yuyu2_tmp3 = $yuyu2; + my $check_yuyuy2b = $yuyu2_tmp3 =~ s/$start_short_tmp_part/$start_short_tmp_part/; + if ($check_yuyuy2b > 0 || $check_yuyuy2b2 > 0) + { + push @filter_dot_before2, $yuyu0; + if (exists($extensions_yuyu{$search})) + { + if ($y > $startprint2) + { + print OUTPUT5 $yuyu0." FOUND2 "; + print OUTPUT5 $extensions_yuyu{$search}." EXTe2\n"; + } + push @extensions_before, $extensions_yuyu{$search}; + push @extensions_before2, $extensions_yuyu{$search}; + } + $filter_before2{$search} = undef; + } + } + } + } + foreach my $search (keys %before_all1) + { + my $overhang_tmp = $overhang; + my $yuyu0 = $before_all1{$search}; + my $yuyu1 = $yuyu0; + if (length($yuyu1) >= $read_length-$overhang_tmp-5-1) + { + $yuyu1 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $yuyu1_tmp = $yuyu1; + my $check_yuyuy = $yuyu1_tmp =~ s/$start_short_tmp_part/$start_short_tmp_part/; + + if ($check_yuyuy > 0) + { + $first_yuyu = "yes"; + $first_yuyu2++; + my $start_short_tmp_part = substr $start_short_tmp, 0, length($yuyu1)-1; + my $star = $start_short_tmp_part =~ tr/\*//; + my $check_yuyuy1b2 = '0'; + my $start_short_tmp_part2; + if ($star > 0) + { + $start_short_tmp_part = substr $start_short_tmp, 0, length($yuyu1)-1+($star*2); + $start_short_tmp_part2 = substr $start_short_tmp, 0, length($yuyu1)-1+($star); + my $yuyu1_tmp2 = $yuyu1; + $check_yuyuy1b2 = $yuyu1_tmp2 =~ 
s/$start_short_tmp_part2/$start_short_tmp_part2/; + } + my $yuyu1_tmp3 = $yuyu1; + my $check_yuyuy1b = $yuyu1_tmp3 =~ s/$start_short_tmp_part/$start_short_tmp_part/; + if ($check_yuyuy1b > 0 || $check_yuyuy1b2 > 0) + { + push @filter_dot_before1, $yuyu0; + if (exists($extensions_yuyu{$search})) + { + if ($y > $startprint2) + { + print OUTPUT5 $yuyu0." FOUND1 "; + print OUTPUT5 $extensions_yuyu{$search}." EXTe1\n"; + } + push @extensions_before, $extensions_yuyu{$search}; + push @extensions_before1, $extensions_yuyu{$search}; + } + $filter_before1{$search} = undef; + } + } + } + } + foreach my $search (keys %before_all3) + { + my $overhang_tmp = $overhang; + my $yuyu0 = $before_all3{$search}; + my $yuyu3 = $yuyu0; + if (length($yuyu3) >= $read_length-$overhang_tmp-5-1) + { + $yuyu3 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $yuyu3_tmp = $yuyu3; + my $check_yuyuy = $yuyu3_tmp =~ s/$start_short_tmp_part/$start_short_tmp_part/; + + if ($check_yuyuy > 0) + { + $third_yuyu = "yes"; + $third_yuyu2++; + my $start_short_tmp_part = substr $start_short_tmp, 0, length($yuyu3)-1; + my $star = $start_short_tmp_part =~ tr/\*//; + my $check_yuyuy3b2 = '0'; + my $start_short_tmp_part2; + if ($star > 0) + { + $start_short_tmp_part = substr $start_short_tmp, 0, length($yuyu3)-1+($star*2); + $start_short_tmp_part2 = substr $start_short_tmp, 0, length($yuyu3)-1+($star); + my $yuyu3_tmp2 = $yuyu3; + $check_yuyuy3b2 = $yuyu3_tmp2 =~ s/$start_short_tmp_part2/$start_short_tmp_part2/; + } + my $yuyu3_tmp3 = $yuyu3; + my $check_yuyuy3b = $yuyu3_tmp3 =~ s/$start_short_tmp_part/$start_short_tmp_part/; + if ($check_yuyuy3b > 0 || $check_yuyuy3b2 > 0) + { + push @filter_dot_before3, $yuyu0; + if (exists($extensions_yuyu{$search})) + { + if ($y > $startprint2) + { + print OUTPUT5 $yuyu0." FOUND3 "; + print OUTPUT5 $extensions_yuyu{$search}." 
EXTe3\n"; + } + push @extensions_before, $extensions_yuyu{$search}; + push @extensions_before3, $extensions_yuyu{$search}; + } + $filter_before3{$search} = undef; + } + } + } + } + foreach my $search (keys %before_all4) + { + my $yuyu0 = $before_all4{$search}; + my $yuyu4 = $yuyu0; + if (length($yuyu4) >= $read_length-$overhang-5-1) + { + $yuyu4 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $yuyu4_tmp = $yuyu4; + my $check_yuyuy = $yuyu4_tmp =~ s/$start_short_tmp_part/$start_short_tmp_part/; + + if ($check_yuyuy > 0) + { + $fourth_yuyu = "yes"; + $fourth_yuyu2++; + my $start_short_tmp_part = substr $start_short_tmp, 0, length($yuyu4)-1; + my $star = $start_short_tmp_part =~ tr/\*//; + my $check_yuyuy4b2 = '0'; + my $start_short_tmp_part2; + if ($star > 0) + { + $start_short_tmp_part = substr $start_short_tmp, 0, length($yuyu4)-1+($star*2); + $start_short_tmp_part2 = substr $start_short_tmp, 0, length($yuyu4)-1+($star); + my $yuyu4_tmp2 = $yuyu4; + $check_yuyuy4b2 = $yuyu4_tmp2 =~ s/$start_short_tmp_part2/$start_short_tmp_part2/; + } + my $yuyu4_tmp3 = $yuyu4; + my $check_yuyuy4b = $yuyu4_tmp3 =~ s/$start_short_tmp_part/$start_short_tmp_part/; + if ($check_yuyuy4b > 0 || $check_yuyuy4b2 > 0) + { + push @filter_dot_before4, $yuyu0; + if (exists($extensions_yuyu{$search})) + { + if ($y > $startprint2) + { + print OUTPUT5 $yuyu0." FOUND4 "; + print OUTPUT5 $extensions_yuyu{$search}." EXTe4\n"; + } + push @extensions_before, $extensions_yuyu{$search}; + push @extensions_before4, $extensions_yuyu{$search}; + } + $filter_before4{$search} = undef; + } + } + } + } + + if ($y > $startprint2) + { + print OUTPUT5 $first_yuyu." FIRST_YUYU\n"; + print OUTPUT5 $second_yuyu." SECOND_YUYU\n"; + print OUTPUT5 $third_yuyu." THIRD_YUYU\n"; + print OUTPUT5 $fourth_yuyu." 
FOURTH_YUYU\n"; + } + + my $correction = '0'; + my $count_all = '0'; + $count_all = (@filter_dot_before1) + (@filter_dot_before2) + (@filter_dot_before3) + (@filter_dot_before4); + my $start_short_tmp2 = substr $read_short_start2, 0, $read_length-$overhang-5-1; + my $check_dot = $start_short_tmp2 =~ tr/\./\./; + if ($check_dot eq "") + { + $check_dot = '0' + } + my $check_star = $start_short_tmp2 =~ tr/\*/\*/; + my $check_star2 = ""; + my $deletion = ""; +print OUTPUT5 $check_star." CHECK_STAR\n"; +print OUTPUT5 $count_all." COUNT_ALL\n"; + if ($count_all > 2 && ($check_dot > 0 || $check_star > 0) && $type ne "mito_plant" && $heteroplasmy eq "") + { + my @split_dot = split /\./, $start_short_tmp2; + my $size = '0'; + my $size2 = '1'; + my $length_total = '0'; + my @filter_dot_before_all; + undef @filter_dot_before_all; + @filter_dot_before_all = (@filter_dot_before1, @filter_dot_before2, @filter_dot_before3, @filter_dot_before4); + + if ($check_star > 0) + { + my @deletion = split //, $start_short_tmp2; + my $found_star = ""; + foreach my $nuc (@deletion) + { + if ($found_star eq "yes") + { + $found_star = ""; + $deletion .= $nuc; + } + if ($nuc eq "*") + { + $found_star = "yes"; + } + } + print OUTPUT5 $deletion." NUC0\n"; + } + foreach my $dot_split (@split_dot) + { + $size++; + } + foreach my $dot_split (@split_dot) + { + if ($size2 ne $size) + { + $length_total += length($dot_split); + my $A = '0'; + my $C = '0'; + my $T = '0'; + my $G = '0'; + my $check_star2_part = ""; + if ($check_star > 0) + { + $check_star2_part = substr $start_short_tmp2, 0, $length_total; + $check_star2 = $check_star2_part =~ tr/\*/\*/; + print OUTPUT5 $check_star2." CHECK_STAR2\n"; + } + foreach my $dot_before2 (@filter_dot_before_all) + { + my $dot = substr $dot_before2, $length_total, 1; + if ($check_star2 > 0) + { + print OUTPUT5 $deletion." 
DEL_DOT\n"; + my $check_star2_part_tmp1 = $check_star2_part; + $check_star2_part_tmp1 =~ tr/\*//d; + my $dot_before2_tmp = $dot_before2; + my $check_star1 = $dot_before2_tmp =~ s/$check_star2_part_tmp1/$check_star2_part_tmp1/; + if ($check_star1 > 0) + { + $dot = substr $dot_before2, $length_total-$check_star2, 1; + } + else + { + $dot = substr $dot_before2, $length_total-($check_star2*2), 1; + } + } + if ($dot eq "A") + { + $A++; + } + elsif ($dot eq "C") + { + $C++; + } + elsif ($dot eq "T") + { + $T++; + } + elsif ($dot eq "G") + { + $G++; + } + } + if (($A > 2 && ($C+$T+$G) eq 0) || ($A > ($C+$T+$G)*10 && $repetitive_detect eq "")) + { + substr $read_new1, $length_total, 1, "A"; + print OUTPUT5 "DOT AAAAAAAAAAAA\n"; + $correction++; + } + if (($C > 2 && ($A+$T+$G) eq 0) || ($C > ($A+$T+$G)*10 && $repetitive_detect eq "")) + { + substr $read_new1, $length_total, 1, "C"; + print OUTPUT5 "DOT CCCCCCCCCCCC\n"; + $correction++; + } + if (($T > 2 && ($C+$A+$G) eq 0) || ($T > ($C+$A+$G)*10 && $repetitive_detect eq "")) + { + substr $read_new1, $length_total, 1, "T"; + print OUTPUT5 "DOT TTTTTTTTTTTT\n"; + $correction++; + } + if (($G > 2 && ($C+$T+$A) eq 0) || ($G > ($C+$T+$A)*10 && $repetitive_detect eq "")) + { + substr $read_new1, $length_total, 1, "G"; + print OUTPUT5 "DOT GGGGGGGGGGGGG\n"; + $correction++; + } + $length_total += 1; + } + $size2++; + } + } + + if ($first_yuyu eq "yes" && $second_yuyu ne "yes" && $third_yuyu ne "yes" && $fourth_yuyu ne "yes" && $first_yuyu2 > 3 && $correction eq $check_dot) + { + if (@extensions_before > 3) + { + $ext_before = "yes"; + @extensions_before = @extensions_before1; + goto EXT_BEFORE_BACK; + } + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + $best_extension = $best_extension1; + if ($y > $startprint2) + { + print OUTPUT5 "\nONLY FIRST\n\n"; + } + delete $SNP_active_back{$id}; + delete $before_back{$id}; + delete $before_shorter_skip_back{$id}; + goto AFTER_EXT_BACK; + } + elsif 
($second_yuyu eq "yes" && $first_yuyu ne "yes" && $third_yuyu ne "yes" && $fourth_yuyu ne "yes" && $second_yuyu2 > 3 && $correction eq $check_dot) + { + if (@extensions_before > 3) + { + $ext_before = "yes"; + @extensions_before = @extensions_before2; + goto EXT_BEFORE_BACK; + } + delete $seed{$id}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + if ($y > $startprint2) + { + print OUTPUT5 "\nONLY SECOND\n\n"; + print OUTPUT5 $best_extension2." BEST_E2\n\n"; + } + delete $SNP_active_back{$id_split1}; + delete $before_back{$id_split1}; + delete $before_shorter_skip_back{$id_split1}; + $id = $id_split1; + if ($correction > 0) + { + delete $seed{$id_split1}; + $best_extension = $best_extension2; + goto AFTER_EXT_BACK; + } + else + { + $read_new = $seed{$id_split1}; + $best_extension = $best_extension2; + $ref_skip_before_back = "yes"; + goto FINISH; + } + } + elsif ($third_yuyu eq "yes" && $first_yuyu ne "yes" && $second_yuyu ne "yes" && $fourth_yuyu ne "yes" && $third_yuyu2 > 3 && $correction eq $check_dot) + { + if (@extensions_before > 3) + { + $ext_before = "yes"; + @extensions_before = @extensions_before3; + goto EXT_BEFORE_BACK; + } + delete $seed{$id}; + delete $seed{$id_split3}; + delete $seed{$id_split1}; + if ($y > $startprint2) + { + print OUTPUT5 "\nONLY THIRD\n\n"; + } + delete $SNP_active_back{$id_split2}; + delete $before_back{$id_split2}; + delete $before_shorter_skip_back{$id_split2}; + $id = $id_split2; + if ($correction > 0) + { + delete $seed{$id_split2}; + $best_extension = $best_extension3; + goto AFTER_EXT_BACK; + } + else + { + $read_new = $seed{$id_split2}; + $best_extension = $best_extension3; + $ref_skip_before_back = "yes"; + goto FINISH; + } + } + elsif ($fourth_yuyu eq "yes" && $first_yuyu ne "yes" && $third_yuyu ne "yes" && $second_yuyu ne "yes" && $fourth_yuyu2 > 3 && $correction eq $check_dot) + { + if (@extensions_before > 3) + { + $ext_before = "yes"; + @extensions_before = @extensions_before4; + goto EXT_BEFORE_BACK; + } + 
delete $seed{$id}; + delete $seed{$id_split2}; + delete $seed{$id_split1}; + if ($y > $startprint2) + { + print OUTPUT5 "\nONLY FOURTH\n\n"; + } + delete $SNP_active_back{$id_split3}; + delete $before_back{$id_split3}; + delete $before_shorter_skip_back{$id_split3}; + $id = $id_split3; + if ($correction > 0) + { + $best_extension = $best_extension4; + delete $seed{$id_split3}; + goto AFTER_EXT_BACK; + } + else + { + $read_new = $seed{$id_split3}; + $best_extension = $best_extension4; + $ref_skip_before_back = "yes"; + goto FINISH; + } + } + elsif($count_all < 3 && $overhang < $read_length-($overlap+15) && $skip_overhang ne "yes") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nMAKE BEFORE SHORTER\n\n"; + } + $before_shorter = "yes"; + goto BEFORE_BACK; + } + else + { + my %count1234; + undef %count1234; + my %count1234b; + undef %count1234b; + my $count1 = '0'; + my $count2 = '0'; + my $count3 = '0'; + my $count4 = '0'; + my $count1_pair = '0'; + my $count2_pair = '0'; + my $count3_pair = '0'; + my $count4_pair = '0'; + foreach my $exb (keys %filter_before1) + { + my $search_tmp = substr $exb, 0, -1; + my $search_end = substr $exb, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + my $found2; + if ($search_end eq "1") + { + $found = $search_tmp[1]; + $found2 = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[0]; + $found2 = $search_tmp[1]; + } + if ($use_quality ne "") + { + $found =~ tr/1234/ACTG/; + $found2 =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + $found2 = decrypt $found2; + } + $count1++; + my $found_reverse = reverse($found); + $found_reverse =~ tr/ACTG/TGAC/; + + if (exists($filter_before1{$exb})) + { + if (exists($filter_before1_pair{$exb})) + { + $filter_before1{$exb} = $found_reverse; + $count1_pair++; + if ($y > $startprint2) + { + print OUTPUT5 $found_reverse." 
1\n"; + } + } + else + { + delete $filter_before1{$exb}; + } + } + } + } + foreach my $exb (keys %filter_before2) + { + my $search_tmp = substr $exb, 0, -1; + my $search_end = substr $exb, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + my $found2; + if ($search_end eq "1") + { + $found = $search_tmp[1]; + $found2 = $search_tmp[0]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[0]; + $found2 = $search_tmp[1]; + } + if ($use_quality ne "") + { + $found =~ tr/1234/ACTG/; + $found2 =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + $found2 = decrypt $found2; + } + my $found_reverse = reverse($found); + $found_reverse =~ tr/ACTG/TGAC/; + $count2++; + if (exists($filter_before2{$exb})) + { + if (exists($filter_before2_pair{$exb})) + { + $filter_before2{$exb} = $found_reverse; + $count2_pair++; + if ($y > $startprint2) + { + print OUTPUT5 $found_reverse." 2\n"; + } + } + else + { + delete $filter_before2{$exb}; + } + } + } + } + foreach my $exb (keys %filter_before3) + { + my $search_tmp = substr $exb, 0, -1; + my $search_end = substr $exb, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[1]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[0]; + } + if ($use_quality ne "") + { + $found =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + my $found_reverse = reverse($found); + $found_reverse =~ tr/ACTG/TGAC/; + $count3++; + if (exists($filter_before3{$exb})) + { + if (exists($filter_before3_pair{$exb})) + { + $filter_before3{$exb} = $found_reverse; + $count3_pair++; + if ($y > $startprint2) + { + print OUTPUT5 $found_reverse." 
3\n"; + } + } + else + { + delete $filter_before3{$exb}; + } + } + } + } + foreach my $exb (keys %filter_before4) + { + my $search_tmp = substr $exb, 0, -1; + my $search_end = substr $exb, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + if ($search_end eq "1") + { + $found = $search_tmp[1]; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[0]; + } + if ($use_quality ne "") + { + $found =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + } + my $found_reverse = reverse($found); + $found_reverse =~ tr/ACTG/TGAC/; + $count4++; + if (exists($filter_before4{$exb})) + { + if (exists($filter_before4_pair{$exb})) + { + $filter_before4{$exb} = $found_reverse; + $count4_pair++; + if ($y > $startprint2) + { + print OUTPUT5 $found_reverse." 4\n"; + } + } + else + { + delete $filter_before4{$exb}; + } + } + } + } + if ($y > $startprint2) + { + print OUTPUT5 $count1." COUNT1\n"; + print OUTPUT5 $count2." COUNT2\n"; + print OUTPUT5 $count3." COUNT3\n"; + print OUTPUT5 $count4." 
COUNT4\n"; + } + $count1234{'1'} = $count1_pair; + $count1234{'2'} = $count2_pair; + $count1234{'3'} = $count3_pair; + $count1234{'4'} = $count4_pair; + + my $morethan3 = '0'; + my $difference = ""; + + foreach my $count1234 (keys %count1234) + { + if ($count1234{$count1234} > 3) + { + $morethan3++; + } + } + my $best_extension1_tmp = $best_extension1; + my $best_extension2_tmp = $best_extension2; + my $SNR_check1 = $best_extension1_tmp =~ s/AAAAAAAA|CCCCCCCC|GGGGGGGG|TTTTTTTT//; + my $SNR_check2 = $best_extension2_tmp =~ s/AAAAAAAA|CCCCCCCC|GGGGGGGG|TTTTTTTT//; + if ($repetitive_detect_back ne "") + { + $SNR_check1 = ""; + $SNR_check2 = ""; + } + my $GGGG2 = $best_extension2 =~ tr/G/G/; + my $TTTT2 = $best_extension2 =~ tr/T/T/; + my $CCCC2 = $best_extension2 =~ tr/C/C/; + my $AAAA2 = $best_extension2 =~ tr/A/A/; + if ((($GGGG2 eq length($best_extension2) || $TTTT2 eq length($best_extension2) || $CCCC2 eq length($best_extension2) || $AAAA2 eq length($best_extension2)) && length($best_extension2) > 2) || length($best_extension2)*0.58 < $AAAA2 || length($best_extension2)*0.58 < $CCCC2 || length($best_extension2)*0.58 < $TTTT2 || length($best_extension2)*0.58 < $GGGG2) + { + $SNR_check2 = '1'; + } + my $GGGG1 = $best_extension1 =~ tr/G/G/; + my $TTTT1 = $best_extension1 =~ tr/T/T/; + my $CCCC1 = $best_extension1 =~ tr/C/C/; + my $AAAA1 = $best_extension1 =~ tr/A/A/; + if ((($GGGG1 eq length($best_extension1) || $TTTT1 eq length($best_extension1) || $CCCC1 eq length($best_extension1) || $AAAA1 eq length($best_extension1)) && length($best_extension1) > 2) || length($best_extension1)*0.58 < $AAAA1 || length($best_extension1)*0.58 < $CCCC1 || length($best_extension1)*0.58 < $TTTT1 || length($best_extension1)*0.58 < $GGGG1) + { + $SNR_check1 = '1'; + } + my $h = '10'; + if ($type eq "mito_plant" && ($count1+$count2+$count3+$count4) > $average_coverage_ext*0.8) + { + $h = '20'; + } + if ($type eq "mito_plant" && ($count1+$count2+$count3+$count4) > $average_coverage_ext*4) 
+ { + $h = ($count1+$count2+$count3+$count4)/($average_coverage_ext/6); + } + if ((($count1 > 4 && ($count2+$count3+$count4) eq 0) || ($count1 > ($count2+$count3+$count4)*$h && ($count2+$count3+$count4) ne 0)) && $repetitive_detect_back2 ne "yes" && $correction eq $check_dot) + { + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + $best_extension = $best_extension1; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE9\n\n"; + } + delete $SNP_active_back{$id}; + delete $before_back{$id}; + delete $before_shorter_skip_back{$id}; + goto AFTER_EXT_BACK; + } + elsif ((($count2 > 4 && ($count1+$count3+$count4) eq 0) || ($count2 > ($count1+$count3+$count4)*$h && ($count1+$count3+$count4) ne 0)) && $repetitive_detect_back2 ne "yes" && $correction eq $check_dot) + { + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE10\n\n"; + } + delete $SNP_active_back{$id_split1}; + delete $before_back{$id_split1}; + delete $before_shorter_skip_back{$id_split1}; + $id = $id_split1; + + $best_extension = $best_extension2; + goto AFTER_EXT_BACK; + } + elsif ((($count3 > 4 && ($count1+$count2+$count4) eq 0) || ($count3 > ($count1+$count2+$count4)*$h && ($count1+$count2+$count4) ne 0)) && $repetitive_detect_back2 ne "yes" && $correction eq $check_dot) + { + delete $seed{$id}; + delete $seed{$id_split3}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE11\n\n"; + } + delete $SNP_active_back{$id_split2}; + delete $before_back{$id_split2}; + delete $before_shorter_skip_back{$id_split2}; + $id = $id_split2; + $best_extension = $best_extension3; + goto AFTER_EXT_BACK; + } + elsif ((($count4 > 4 && ($count1+$count3+$count2) eq 0) || ($count4 > ($count1+$count3+$count2)*$h && ($count1+$count3+$count2) ne 0)) && 
$repetitive_detect_back2 ne "yes" && $correction eq $check_dot) + { + if (@extensions_before > 3) + { + $ext_before = "yes"; + goto EXT_BEFORE_BACK; + } + delete $seed{$id}; + delete $seed{$id_split2}; + delete $seed{$id_split1}; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE12\n\n"; + } + delete $SNP_active_back{$id_split3}; + delete $before_back{$id_split3}; + delete $before_shorter_skip_back{$id_split3}; + $id = $id_split3; + + $best_extension = $best_extension4; + goto AFTER_EXT_BACK; + } + elsif((($morethan3 eq 0 && $overhang < $read_length-($overlap*1.5)) || ($morethan3 > 0 && length($read) <= $insert_size && $overhang < 30)) && $skip_overhang ne "yes") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nMAKE BEFORE SHORTER2\n\n"; + } + $before_shorter = "yes"; + goto BEFORE_BACK; + } + elsif ($overhang_for_pairs > $overhang && $average_coverage_ext < 80 && $skip_overhang eq "") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nINCREASE_OVERHANG_BACK\n\n"; + } + $overhang = $overhang_for_pairs; + goto BEFORE_BACK; + } + elsif ($morethan3 > 0 && length($read) > $insert_size && $last_chance_back ne "yes" && (($SNR_check1 eq "" && $SNR_check2 eq "") || $overhang < 30)) + { + undef @extensions_before; + undef @extensions_before1; + undef @extensions_before2; + undef @extensions_before3; + undef @extensions_before4; + my $count1b = '0'; + my $count2b = '0'; + my $count3b = '0'; + my $count4b = '0'; + + my $size = keys %read_short_start_tmp; + my $insert_range_before = 1.65; + if ($insert_range_shorter eq "yes") + { + $insert_range_before = 1.45; + } + + my $F = ($insert_size-$read_length) - ((($insert_size*$insert_range_before)-$insert_size)/2) - $overhang; + if ($F < 0) + { + $F = '0'; + } + my $read_short_start_tempie = substr $read, $F, ($insert_size*$insert_range_before)+$overhang; + + $read_short_start_tempie =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + undef %read_short_start_tmp; + + %read_short_start_tmp = build_partial3c 
$read_short_start_tempie; + if ($y > $startprint2) + { + print OUTPUT5 $read_short_start_tempie." READ_SHORT\n"; + } + + my $ff = '0'; + my %hash_read_short_start; + undef %hash_read_short_start; + foreach my $read_short_start_tempie (keys %read_short_start_tmp) + { + $ff = '0'; + while ($ff < (length($read_short_start_tempie)-$read_length+$left+$right+2)) + { + my $read_short_start_part = substr $read_short_start_tempie, $ff, $read_length-$left-$right-1; + my $star_start = substr $read_short_start_part, 0, 1; + if ($star_start eq "*") + { + $read_short_start_part = substr $read_short_start_tempie, $ff+1, $read_length-$left-$right-1; + } + + $hash_read_short_start{$read_short_start_part} = undef; + $ff++; + } + } + + foreach my $exb0 (keys %filter_before1) + { + my $exb = $filter_before1{$exb0}; + my $match_pair_middle = substr $exb, $right, $read_length-$left-$right; +FILTER_1_BACK: foreach my $line (keys %hash_read_short_start) + { + my $found_seq = '0'; + my $match_pair_middle_tmp = $match_pair_middle; + + $found_seq = $match_pair_middle_tmp =~ s/.$line/$line/; + if ($found_seq > 0) + { + if (exists($filter_before1_pair{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $filter_before1_pair{$exb0}." FOUND1\n"; + } + } + if (exists($extensions_yuyu{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $extensions_yuyu{$exb0}." 
EXT1\n"; + } + push @extensions_before, $extensions_yuyu{$exb0}; + push @extensions_before1, $extensions_yuyu{$exb0}; + } + $count1b++; + last FILTER_1_BACK; + } + } + } + foreach my $exb0 (keys %filter_before2) + { + my $exb = $filter_before2{$exb0}; + my $match_pair_middle = substr $exb, $right, $read_length-$left-$right; +FILTER_2_BACK: foreach my $line (keys %hash_read_short_start) + { + my $found_seq = '0'; + my $match_pair_middle_tmp = $match_pair_middle; + + $found_seq = $match_pair_middle_tmp =~ s/.$line/$line/; + if ($found_seq > 0) + { + if (exists($filter_before2_pair{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $filter_before2_pair{$exb0}." FOUND2\n"; + } + if (exists($extensions_yuyu{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $extensions_yuyu{$exb0}." EXT2\n"; + } + push @extensions_before, $extensions_yuyu{$exb0}; + push @extensions_before2, $extensions_yuyu{$exb0}; + } + } + $count2b++; + last FILTER_2_BACK; + } + } + } + foreach my $exb0 (keys %filter_before3) + { + my $exb = $filter_before3{$exb0}; + my $match_pair_middle = substr $exb, $right, $read_length-$left-$right; +FILTER_3_BACK: foreach my $line (keys %hash_read_short_start) + { + my $found_seq = '0'; + my $match_pair_middle_tmp = $match_pair_middle; + + $found_seq = $match_pair_middle_tmp =~ s/.$line/$line/; + if ($found_seq > 0) + { + if (exists($filter_before3_pair{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $filter_before3_pair{$exb0}." FOUND3\n"; + } + if (exists($extensions_yuyu{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $extensions_yuyu{$exb0}." 
EXT3\n"; + } + push @extensions_before, $extensions_yuyu{$exb0}; + push @extensions_before3, $extensions_yuyu{$exb0}; + } + } + $count3b++; + last FILTER_3_BACK; + } + } + } + foreach my $exb0 (keys %filter_before4) + { + my $exb = $filter_before4{$exb0}; + my $match_pair_middle = substr $exb, $right, $read_length-$left-$right; +FILTER_4_BACK: foreach my $line (keys %hash_read_short_start) + { + my $found_seq = '0'; + my $match_pair_middle_tmp = $match_pair_middle; + + $found_seq = $match_pair_middle_tmp =~ s/.$line/$line/; + if ($found_seq > 0) + { + if (exists($filter_before4_pair{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $filter_before4_pair{$exb0}." FOUND4\n"; + } + if (exists($extensions_yuyu{$exb0})) + { + if ($y > $startprint2) + { + print OUTPUT5 $extensions_yuyu{$exb0}." EXT4\n"; + } + push @extensions_before, $extensions_yuyu{$exb0}; + push @extensions_before4, $extensions_yuyu{$exb0}; + } + } + $count4b++; + last FILTER_4_BACK; + } + } + } + if ($y > $startprint2) + { + print OUTPUT5 $count1b." COUNT1B\n"; + print OUTPUT5 $count2b." COUNT2B\n"; + print OUTPUT5 $count3b." COUNT3B\n"; + print OUTPUT5 $count4b." 
COUNT4B\n"; + } + my $f = '7'; + if ($SNR_check1 ne "" || $SNR_check2 ne "") + { + $f = '8'; + } + if ($repetitive_detect_back ne "") + { + $f = '10'; + } + if ($type eq "mito_plant") + { + $f = '10'; + } + if ($type eq "mito_plant" && ($count1b+$count2b+$count3b+$count4b) > '25') + { + $f = '20'; + } + my $dup = ""; + my $r = '4'; + if ($type eq "mito_plant" && ($count1b+$count2b+$count3b+$count4b) > $average_coverage_ext*4) + { + $dup = "yes"; + $r = 6; + } + + $count1234b{'1'} = $count1b; + $count1234b{'2'} = $count2b; + $count1234b{'3'} = $count3b; + $count1234b{'4'} = $count4b; + + my $differenceb1 = ""; + my $differenceb2 = ""; + my $differenceb3 = ""; + my $differenceb4 = ""; + + foreach my $count1234b (keys %count1234b) + { + if ($count1234b ne '1' && $count1234b{$count1234b} > 0 && $count1b > $f*$count1234b{$count1234b}) + { + $differenceb1 = "yes"; + } + if ($count1234b ne '2' && $count1234b{$count1234b} > 0 && $count2b > $f*$count1234b{$count1234b}) + { + $differenceb2 = "yes"; + } + if ($count1234b ne '3' && $count1234b{$count1234b} > 0 && $count3b > $f*$count1234b{$count1234b}) + { + $differenceb3 = "yes"; + } + if ($count1234b ne '4' && $count1234b{$count1234b} > 0 && $count4b > $f*$count1234b{$count1234b}) + { + $differenceb4 = "yes"; + } + } + + if (($count1b > 2 && ($count2b+$count3b+$count4b) eq '0') || ($differenceb1 eq "yes" && $differenceb2 eq "" && $differenceb3 eq "" && $differenceb4 eq "") && ($dup ne "yes" || ($count2b+$count3b+$count4b) < $average_coverage_ext/$r)) + { + if (@extensions_before > 15 && $repetitive_detect_back2 eq "") + { + $ext_before = "yes"; + @extensions_before = @extensions_before1; + goto EXT_BEFORE_BACK; + } + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + $best_extension = $best_extension1; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE5\n\n"; + } + delete $SNP_active_back{$id}; + delete $before_back{$id}; + delete $before_shorter_skip_back{$id}; + 
goto AFTER_EXT_BACK; + } + elsif (($count2b > 2 && ($count1b+$count3b+$count4b) eq '0') || ($differenceb2 eq "yes" && $differenceb1 eq "" && $differenceb3 eq "" && $differenceb4 eq "") && ($dup ne "yes" || ($count1b+$count3b+$count4b) < $average_coverage_ext/$r)) + { + if (@extensions_before > 15 && $repetitive_detect_back2 eq "") + { + $ext_before = "yes"; + @extensions_before = @extensions_before2; + goto EXT_BEFORE_BACK; + } + delete $seed{$id}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE6\n\n"; + } + delete $SNP_active_back{$id_split1}; + delete $before_back{$id_split1}; + delete $before_shorter_skip_back{$id_split1}; + $id = $id_split1; + if ($correction > 0) + { + delete $seed{$id_split1}; + $best_extension = $best_extension2; + goto AFTER_EXT_BACK; + } + else + { + $read_new = $seed{$id_split1}; + goto FINISH; + } + } + elsif (($count3b > 2 && ($count1b+$count2b+$count4b) eq '0') || ($differenceb3 eq "yes" && $differenceb2 eq "" && $differenceb1 eq "" && $differenceb4 eq "") && ($dup ne "yes" || ($count2b+$count1b+$count4b) < $average_coverage_ext/$r)) + { + if (@extensions_before > 15 && $repetitive_detect_back2 eq "") + { + $ext_before = "yes"; + @extensions_before = @extensions_before3; + goto EXT_BEFORE_BACK; + } + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split3}; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE7\n\n"; + } + delete $SNP_active_back{$id_split2}; + delete $before_back{$id_split2}; + delete $before_shorter_skip_back{$id_split2}; + $id = $id_split2; + if ($correction > 0) + { + delete $seed{$id_split2}; + $best_extension = $best_extension3; + goto AFTER_EXT_BACK; + } + else + { + $read_new = $seed{$id_split2}; + goto FINISH; + } + } + elsif (($count4b > 2 && ($count1b+$count2b+$count3b) eq '0') || ($differenceb4 eq "yes" && $differenceb2 eq "" && $differenceb3 eq "" && $differenceb1 eq "") && ($dup ne 
"yes" || ($count2b+$count3b+$count1b) < $average_coverage_ext/$r)) + { + if (@extensions_before > 15 && $repetitive_detect_back2 eq "") + { + $ext_before = "yes"; + @extensions_before = @extensions_before4; + goto EXT_BEFORE_BACK; + } + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + if ($y > $startprint2) + { + print OUTPUT5 "\nREVERSE_DELETE SPLIT_BEFORE8\n\n"; + } + delete $SNP_active_back{$id_split3}; + delete $before_back{$id_split3}; + delete $before_shorter_skip_back{$id_split3}; + $id = $id_split3; + if ($correction > 0) + { + delete $seed{$id_split3}; + $best_extension = $best_extension4; + goto AFTER_EXT_BACK; + } + else + { + $read_new = $seed{$id_split3}; + goto FINISH; + } + } + elsif(($count1b+$count2b+$count3b+$count4b) < 10 && $overhang < $read_length-($overlap*1.5) && $before_shorter_skip_back ne "yes" && $skip_overhang ne "yes") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nMAKE BEFORE SHORTER3\n\n"; + } + $before_shorter = "yes"; + goto BEFORE_BACK; + } + elsif(($count1b+$count2b+$count3b+$count4b) <= 10 && $overhang < $read_length-($overlap*1.5) && $before_shorter_skip_back ne "yes" && $insert_range_before eq "yes") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nMAKE BEFORE SHORTER3\n\n"; + } + $insert_range_shorter = ""; + goto BEFORE_BACK; + } + elsif(($count1b+$count2b+$count3b+$count4b) > 10 && $before_shorter_skip_back ne "yes" && $insert_range_shorter eq "") + { + if ($y > $startprint2) + { + print OUTPUT5 "\nSMALLER INSERT_BACK\n\n"; + } + $insert_range_shorter = "yes"; + goto BEFORE_BACK; + } + elsif (($count1b+$count2b+$count3b+$count4b) > 9 && $extensions_before ne "yes" && $repetitive_detect_back2 ne "yes" && $before_shorter_skip_back ne "yes" && $heteroplasmy eq "") + { + $l = 0; + $best_extension = ""; + $SNP = ""; + @extensions = @extensions_before; + $ext = '0'; + foreach (@extensions) + { + $ext++; + } + $extensions_before = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "\nEXTENSIONS 
FROM BEFORE BACK\n\n"; + } + $best_extension_old1 = $best_extension1; + $best_extension_old2 = $best_extension2; + $best_extension_old3 = $best_extension3; + $best_extension_old4 = $best_extension4; + $SNP_active_back = "yes"; + delete $before_back{$id}; + $before_shorter_skip_back{$id} = "yes"; + delete $seed{$id}; + delete $seed{$id_original}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + undef @extensions_group1; + undef @extensions_group2; + undef @extensions_group3; + undef @extensions_group4; + goto NUCLEO_BACK; + } + } +EXT_BEFORE_BACK: if ($ext_before eq "yes") + { + $l = 0; + $best_extension = ""; + $SNP = ""; + @extensions = @extensions_before; + $ext = '0'; + foreach (@extensions) + { + $ext++; + } + $extensions_before = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "\nEXTENSIONS FROM BEFORE BACK2\n\n"; + } + $best_extension_old1 = $best_extension1; + $best_extension_old2 = $best_extension2; + $best_extension_old3 = $best_extension3; + $best_extension_old4 = $best_extension4; + $SNP_active_back = "yes"; + delete $before_back{$id}; + $before_shorter_skip_back{$id} = "yes"; + delete $seed{$id}; + delete $seed{$id_original}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + undef @extensions_group1; + undef @extensions_group2; + undef @extensions_group3; + undef @extensions_group4; + goto NUCLEO_BACK; + } + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + $before_back{$id} = "yes"; + $best_extension = ""; + $read_new = $read_new1; + $before_shorter_skip_back{$id} = "yes"; + if ($before_shorter_skip_back eq "yes") + { + $before_back{$id} = "yes"; + } + + if ($count_split > 2) + { + my $tmp = '0'; + if ($first_yuyu ne "yes") + { + $yuyu_option_back{$id.$firstSNP_max[$tmp]} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 $firstSNP_max[$tmp]." 
FIRST_YUYU\n"; + } + } + $tmp++; + if ($second_yuyu ne "yes") + { + $yuyu_option_back{$id.$firstSNP_max[$tmp]} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 $firstSNP_max[$tmp]." SECOND_YUYU\n"; + } + } + $tmp++; + if ($third_yuyu ne "yes") + { + $yuyu_option_back{$id.$firstSNP_max[$tmp]} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 $firstSNP_max[$tmp]." THIRD_YUYU\n"; + } + } + $tmp++; + if ($fourth_yuyu ne "yes") + { + $yuyu_option_back{$id.$firstSNP_max[$tmp]} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 $firstSNP_max[$tmp]." FOURTH_YUYU\n"; + } + } + } + goto FINISH; + } + } + print OUTPUT5 $delete_first." DELETE_FIRST\n"; + if (($extensions_before eq "yes" || ($indel_split_skip_back ne "yes" && ($delete_first ne "yes_back" || (length($best_extension1) < 5 && length($best_extension2) < 5)) && $ext_total_back > 18 && $indel_split_back < ($read_length-25 -$overlap) && $SNR_read_back eq "")) && $deletion_back ne "yes") + { + $best_extension = ""; + $indel_split_back{$id} = $indel_split_back+10; + if ($y > $startprint2) + { + print OUTPUT5 "INCREASE_INDEL_SPLIT_BACK\n"; + } + goto FINISH; + } + elsif ($SNP_active_back eq "" && ($delete_first ne "yes_back" || (length($best_extension1) < 5 && length($best_extension2) < 5)) && $deletion_back ne "yes") + { + $best_extension = ""; + if ($y > $startprint2) + { + print OUTPUT5 "SNP_ACTIVE_BACK\n"; + } + delete $indel_split_back{$id}; + $SNP_active_back{$id} = "yes"; + goto FINISH; + } + elsif ($deletion_back eq "yes") + { + if ($y > $startprint2) + { + print OUTPUT5 "DELETION_DETECT\n"; + } + delete $seed{$id_split1}; + delete $seed{$id}; + delete $before_back{$id}; + delete $before_shorter_skip_back{$id}; + $read_new = $read_new1; + } + else + { + if ($y > $startprint2) + { + print OUTPUT5 "ELSE\n"; + } + delete $seed{$id_split1}; + delete $seed{$id}; + $indel_split_back = '0'; + delete $indel_split_back{$id}; + $noback{$id} = "stop"; + $read_new = $read_new1; + } + } + else + { + if 
($best_extension ne "") + { + $indel_split_back = '0'; + delete $indel_split_back{$id}; + } + + my @ext = split //, $best_extension; + my $u = length($best_extension); + my $v = '1'; + + if (($SNR_test eq "yes2_back22222" || $SNR_test eq "yes2_double_back") && $u > 2) + { + my @ext3 = split //, $best_extension; + $v = '1'; + $u = length($best_extension); + while ($ext3[$u-$v-1] eq $ext3[$u-$v] && ($u-$v-1) > 1) + { + chop($best_extension); + $v++; + } + my $SNR_max = '0'; + my $SNR_min = '1000'; + my $n = '0'; + my $extensions_after_SNR = substr $best_extension, 0,4; + $extensions_after_SNR =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + foreach my $SNR_ext (keys %SNR_count_back) + { + my $SNR_count_back = $SNR_count_back{$SNR_ext}; + my $checkie = substr $SNR_ext, 0, 4; + if ($SNR_count_back ne "" && $checkie eq $extensions_after_SNR) + { + if ($SNR_count_back > $SNR_max) + { + $SNR_max = $SNR_count_back; + } + if ($SNR_count_back < $SNR_min) + { + $SNR_min = $SNR_count_back; + print $SNR_ext." MIN\n"; + } + } + } + if ($SNR_min eq '1000') + { + $SNR_min = '0'; + } + + my $p = '0'; + my $ut = '0'; + if ($SNR_test eq "yes2_back" && $SNR_max > 0) + { + $best_extension = $SNR_nucleo.$best_extension; + $ut = '1'; + while ($p < $SNR_max - $SNR_min) + { + $best_extension = "X".$best_extension; + $p++; + } + } + elsif ($SNR_test eq "yes2_double_back" && $SNR_max > 0) + { + $best_extension = $SNR_nucleo.$best_extension; + $ut = '2'; + while ($p < ($SNR_max - $SNR_min)/2) + { + $best_extension = "X2".$best_extension; + $p++; + } + } + $p = '0'; + + while ($p+$ut < $SNR_min) + { + $best_extension = $SNR_nucleo.$best_extension; + if ($SNR_test eq "yes2_back") + { + $p++; + } + if ($SNR_test eq "yes2_double_back") + { + $p++; + $p++; + } + } + delete $SNR_back{$id}; + } + elsif ($SNR_test eq "yes2_back2222" || $SNR_test eq "yes2_double_back") + { + my $ee2 = '10000'; + foreach (keys %SNR_count_back) + { + my $ee = $SNR_count_back{$_}; + if ($ee < $ee2) + { + $ee2 = $ee; + } + } + 
$best_extension = ""; + my $ee3 = '0'; + if ($ee2 ne '10000') + { + while ($ee3 < $ee2) + { + $best_extension = $SNR_nucleo.$best_extension; + $ee3++; + } + } + else + { + $best_extension = ""; + } + } + + $SNR_test = ""; + if ($y > $startprint2) + { + print OUTPUT5 reverse($best_extension)." BEST_EXTENSION_BACK\n\n"; + } + $best_extension_back_prev{$id} = $best_extension; + + + } +AFTER_EXT_BACK: + undef @matches; + undef @matches1; + undef @matches2; + chomp $best_extension; + + my $vk2 = '0'; + + if ($SNR_read_back eq "") + { + my @dot2 = split //, $best_extension; + my $ut2 = length($best_extension); + + while ($dot2[$ut2-1] eq "." || $dot2[$ut2-1] eq "*") + { + if ($dot2[$ut2-1] eq "*") + { + chop $best_extension; + chop $best_extension; + $vk2++; + $vk2++; + $ut2--; + $ut2--; + } + else + { + chop $best_extension; + $vk2++; + $ut2--; + } + } + + } + + if ($noback ne "stop" && ($best_extension ne "" && $best_extension ne " ")) + { + $read_new = reverse($best_extension).$read_new1; + $position_back += length($best_extension); + $position_back -= $vk2; + + my $position_back_tmp = $position_back{$id}; + delete $position_back{$id}; + $position_back{$id} = $position_back; + $position_back = $position_back_tmp; + if ($best_extension ne "" && $split_forward eq "") + { + delete $indel_split_skip_back{$id}; + } + + if (exists($last_ref_seq_back{$id}) && $split eq "") + { + my $seq_tmp = $last_ref_seq_back{$id}; + $last_ref_seq_back{$id} = reverse($best_extension).$seq_tmp; + } + if (exists($large_variance_back{$id}) && $split eq "") + { + $large_variance_length_back{$id} = $large_variance_length_back{$id}+length($best_extension); + } + + $best_extension = ""; + delete $regex_back{$id}; + delete $last_chance_back{$id}; + if ($split_forward eq "") + { + delete $before_back{$id}; + delete $before_shorter_skip_back{$id}; + } + + $id_test = $id; + + if ($split eq "") + { + delete $yuyu_option_back{$id.'A'}; + delete $yuyu_option_back{$id.'C'}; + delete 
$yuyu_option_back{$id.'T'}; + delete $yuyu_option_back{$id.'G'}; + delete $before_shorter_skip_back{$id}; + } + if ($split eq "") + { + foreach my $add_read2 (keys %extensions) + { + my $add_read = substr $extensions{$add_read2}, 0, -1; + $count_reads{$add_read} = undef; + } + } + } + elsif ($use_regex_back ne "yes" && $noback ne "stop" && $indel_split_back > 0) + { + delete $regex_back{$id}; + if ($split_forward eq "") + { + $indel_split_skip_back{$id} = "yes"; + $indel_split_back = '0'; + delete $indel_split_back{$id}; + } + elsif ($split_forward ne "" && exists($SNP_active_back{$id})) + { + $indel_split_skip_back{$id} = "yes"; + $indel_split_back = '0'; + delete $indel_split_back{$id}; + } + elsif ($split_forward ne "") + { + $SNP_active_back{$id} = "yes"; + } + $read_new = $read_new1; + if ($y > $startprint2) + { + print OUTPUT5 "2B\n"; + } + } + elsif ($use_regex_back ne "yes" && $noback ne "stop" && $repetitive_detect_back eq "") + { + delete $regex_back{$id}; + $regex_back{$id} = "yes"; + $read_new = $read_new1; + if ($y > $startprint2) + { + print OUTPUT5 "3B\n"; + } + } + elsif ($use_regex_back ne "yes" && $last_chance_back ne "yes" && $indel_split_back > 0 && $noback ne "stop") + { + $read_new = $read_new1; + $indel_split_skip_back{$id} = "yes"; + $indel_split_back = '0'; + delete $indel_split_back{$id}; + $SNP_active_back{$id} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "4B\n"; + } + } + elsif (($use_regex_back eq "yes" || $repetitive_detect_back ne "") && $last_chance_back ne "yes" && $noback ne "stop" && $AT_rich_before_back eq "") + { + $read_new = $read_new1; + delete $last_chance_back{$id}; + $last_chance_back{$id} = "yes"; + delete $regex_back{$id}; + if ($y > $startprint2) + { + print OUTPUT5 "5B\n"; + } + } + elsif ($last_chance_back eq "yesssssssssssssssssssssssssssssssssss" && $SNR_read_back ne "" && $noback ne "stop") + { + $read_new = $read_new1; + delete $last_chance_back{$id}; + $last_chance_back{$id} = "yes"; + $last_chance = ""; 
# Remaining branches of the if/elsif chain that starts with the
# `$best_extension ne ""` test after AFTER_EXT_BACK.  These branches run when
# that first test failed; each one adjusts the per-id state hashes
# (%last_chance_back, %regex_back, %noback, ...) or jumps away
# (MERGE / SEED / SPLIT_BACK).  The span starts inside the body of the
# preceding elsif and ends inside the FINISH section.
        if ($y > $startprint2)
        {
            print OUTPUT5 "6B\n";
        }
    }
    # 7B: last-chance flag is set and the regex flag is not "yes":
    # keep the read unchanged and set both flags for this id.
    elsif ($last_chance_back eq "yes" && $noback ne "stop" && $use_regex_back ne "yes")
    {
        $read_new = $read_new1;
        delete $last_chance_back{$id};
        $last_chance_back{$id} = "yes";
        delete $regex_back{$id};
        $regex_back{$id} = "yes";
        if ($y > $startprint2)
        {
            print OUTPUT5 "7B\n";
        }
    }
    # FINISH_HP_BACK: short read while $hp_seed_assemble is set and a forward
    # extension exists: only the last-chance flag is cleared.
    elsif (length($read) < $insert_size+100 && $hp_seed_assemble ne "" && $best_extension_forward ne "")
    {
        delete $last_chance_back{$id};
        if ($y > $startprint2)
        {
            print OUTPUT5 "FINISH_HP_BACK\n";
        }
    }
    # 8B: regex flag was already "yes" and either last chance was used or
    # $AT_rich_before_back is set: mark this id as stopped ("stop" in %noback).
    elsif (($last_chance_back eq "yes" || $AT_rich_before_back ne "") && $noforward ne "stop" && $noback ne "stop" && $use_regex_back eq "yes")
    {
        $read_new = $read_new1;
        delete $last_chance_back{$id};
        $noback = "stop";
        $noback{$id} = "stop";
        delete $regex_back{$id};
        if ($y > $startprint2)
        {
            print OUTPUT5 "8B\n";
        }
    }
    # 8B_MERGE: id has an %old_id entry and $noforward is "stop": jump to MERGE.
    elsif (exists($old_id{$id}) && $noforward eq "stop")
    {
        $merge_now = "yes2";
        if ($y > $startprint2)
        {
            print OUTPUT5 "8B_MERGE\n";
        }
        goto MERGE;
    }
    # 9B: id is listed in %seed_split: drop it from %seed, optionally store the
    # read as a contig, then continue at SEED.
    elsif (exists($seed_split{$id}))
    {
        delete $seed{$id};
        if ($y > $startprint2)
        {
            print OUTPUT5 "9B\n";
        }
        my $old_rep_old_check = "";
        if (exists ($old_rep_old{$id}))
        {
            $old_rep_old_check = "yes";
        }
        if (length($read) > 250 && $old_rep_old_check eq "")
        {
            # The contig key uses the trailing number of the id.
            if ($id =~ m/(.*_|)(\d+)$/)
            {
                my $check_id = $2;
                if ($contig_num eq '1')
                {
                    $contigs{$contig_num."+".$check_id} = $read;
                    my $start_point = '500';
                    my $check_repetitive = '3';

                    # Move the start point forward while the 15 characters at
                    # $start_point match more than twice inside the window
                    # [0, $start_point+200) of the read.
                    while ($check_repetitive > 2)
                    {
                        my $check_for_rep = substr $read, 0, $start_point+200;
                        my $repetitive = substr $check_for_rep, $start_point, 15;
                        $check_repetitive = $check_for_rep =~ s/$repetitive/$repetitive/g;
                        # FIX: this guard was missing (a bare block stood here),
                        # so the start point was shifted by 20 on every pass,
                        # including the final non-repetitive one.  The two
                        # other copies of this loop (CONTIG_START6a and
                        # CONTIG_START6) already test $check_repetitive first.
                        if ($check_repetitive > 2)
                        {
                            $start_point += 20;
                            if ($y > $startprint2)
                            {
                                print OUTPUT5 "DETECT_REPETITIVE_IN_START_SEQUENCE\n";
                            }
                        }
                    }
                    $first_contig_start = substr $read, $start_point, $overlap;
                    # tr with an empty replacement only counts: slide by 10
                    # until the start sequence holds none of the listed
                    # characters (ambiguity codes, '.', and '|').
                    my $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;
                    while ($check_start > 0)
                    {
                        $start_point += 10;
                        $first_contig_start = substr $read, $start_point, $overlap;
                        $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;
                    }
                    if ($y > $startprint2)
                    {
                        print OUTPUT5 $first_contig_start." CONTIG_START5\n";
                    }
                    # Drop every seed whose first 1000 characters match the
                    # start sequence of the first contig.
                    foreach my $seedie (keys %seed)
                    {
                        my $seedie_part = substr $seed{$seedie}, 0, 1000;
                        my $check_seedie = $seedie_part =~ s/$first_contig_start/$first_contig_start/;
                        if ($check_seedie > 0)
                        {
                            delete $seed{$seedie};
                        }
                    }
                }
                else
                {
                    $contigs{$contig_num."+".$check_id} = $read;
                }
                $contig_num++;
                $seed_old{$id} = $read;
            }
        }
SKIP1:  if ($y > $startprint2)
        {
            print OUTPUT5 ">".$id."\n";
            print OUTPUT5 $read."\n";
        }
        goto SEED;
    }
    # 10B: nothing else applied: mark this id as stopped.
    else
    {
        $noback = "stop";
        $noback{$id} = "stop";
        # NOTE(review): unlike the other log lines in this chain, this print is
        # not guarded by `$y > $startprint2` - confirm whether that is intended.
        print OUTPUT5 "10B\n";
    }

    # While $split holds one of the "yes*_back" states, store the current
    # result for this id and jump to SPLIT_BACK.  $split is switched to
    # "yes4_back" once $count_split equals 2 (resp. 3).
    if ($split eq "yes2_back")
    {
        $seed{$id} = $read_new;
        $insert_size2{$id} = $insert_size;
        $read_new = $read_new1;

        if ($count_split eq '2')
        {
            $split = "yes4_back";
        }
        goto SPLIT_BACK;
    }
    elsif ($split eq "yes3_back")
    {
        $seed{$id} = $read_new;
        $insert_size2{$id} = $insert_size;
        $read_new = $read_new1;

        if ($count_split eq '3')
        {
            $split = "yes4_back";
        }
        goto SPLIT_BACK;
    }
    if ($split eq "yes4_back")
    {
        $seed{$id} = $read_new;
        $insert_size2{$id} = $insert_size;
        $read_new = $read_new1;

        goto SPLIT_BACK;
    }
}
# 5Bb: alternative to the enclosing `if` closed just above: set the
# last-chance flag for this id and clear its regex flag.
elsif ($last_chance_back ne "yes" && $noback ne "stop" && $AT_rich_before_back eq "")
{
    $read_new = $read_new1;
    delete $last_chance_back{$id};
    $last_chance_back{$id} = "yes";
    delete $regex_back{$id};
    if ($y > $startprint2)
    {
        print OUTPUT5 "5Bb\n";
    }
}
FINISH:
# When $ref_skip_before_back is "yes" (and $split is empty), add the current
# extension to the per-id %last_ref_seq_back / %large_variance_length_back.
if ($ref_skip_before_back eq "yes")
{
    if (exists($last_ref_seq_back{$id}) && $split eq "")
    {
        my $seq_tmp = $last_ref_seq_back{$id};
        $last_ref_seq_back{$id} = reverse($best_extension).$seq_tmp;
    }
    if (exists($large_variance_back{$id}) && $split eq "")
    {
$large_variance_length_back{$id} = $large_variance_length_back{$id}+length($best_extension); + } + } + + my $count_seed = '0'; + foreach my $count_seed2 (keys %seed) + { + $count_seed++; + } + if ($y > $startprint2) + { + if ($noback ne "") + { + print OUTPUT5 $noback." NOBACK\n"; + } + if ($noforward ne "") + { + print OUTPUT5 $noforward." NOFORWARD\n"; + } + if ($last_chance ne "") + { + print OUTPUT5 $last_chance." LASTCHANCE\n"; + } + if ($use_regex_back ne "") + { + print OUTPUT5 $use_regex_back." REGEX_BACK\n"; + } + if ($last_chance_back ne "") + { + print OUTPUT5 $last_chance_back." LASTCHANCE_BACK\n"; + } + print OUTPUT5 $count_seed." COUNTSEED\n"; + } + if ($delete_first eq "yes2" && $indel_split > 0 && $SNR_next_seed ne "yes" && $reference_next_seed ne "yes") + { + $indel_split = '0'; + $indel_split_skip = "yes"; + delete $indel_split{$id}; + $noforward{$id} = ""; + $seed{$id} = $read_new; + if ($y > $startprint2) + { + print OUTPUT5 "FINISH1\n"; + } + } + elsif ($merge ne "yes" && $reference_next_seed ne "yes" && $circle eq "" && (($noback ne "stop" || ($noforward ne "stop" && $delete_first ne "yes2"))) && ($AT_rich ne "yes" || $noback ne "stop") && $bad_read eq "") + { + delete $seed{$id}; + $seed{$id} = $read_new; + if ($y > $startprint2) + { + print OUTPUT5 "FINISH2\n"; + } + } + elsif ($merge ne "yes" && $reference_next_seed ne "yes" && $circle eq "" && $AT_rich ne "yes" && $bad_read eq "" && $delete_first eq "yes2" && $last_chance ne "yes" && $SNR_next_seed ne "yes") + { + delete $seed{$id}; + $seed{$id} = $read_new; + delete $last_chance{$id}; + $last_chance{$id} = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "FINISH3\n"; + } + } + elsif ($position < $insert_size && $hp_seed_assemble eq "yes") + { + delete $seed{$id}; + $seed{$id} = $read_new; + delete $last_chance{$id}; + $last_chance{$id} = "yes"; + if ($noback_HP ne "yes") + { + delete $noback{$id}; + } + $hp_seed_assemble = "yes2"; + if ($y > $startprint2) + { + print OUTPUT5 
"FINISH_HP\n"; + } + } + elsif ($hp_seed_assemble ne "" && $hp_seed_assemble_last_chance eq "" && ($noback_HP eq "" || $noforward eq "")) + { + delete $seed{$id}; + $seed{$id} = $read_new; + delete $last_chance{$id}; + delete $last_chance_back{$id}; + if ($noback_HP ne "yes") + { + delete $noback{$id}; + } + if ($noforward_HP ne "yes") + { + delete $noforward{$id}; + } + $hp_seed_assemble_last_chance = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 "FINISH_HP_LAST\n"; + } + } + elsif ($no_next_seed ne "yes" && $contig_num eq '1' && exists($old_id{$id}) && (($SNR_next_seed eq "yes" && ($last_chance_back eq "yes" || $noback eq "stop")) || ($nosecond eq "" && $CP_check ne "yes" && length($read) > $read_length+150 && $circle eq "" && (($last_chance eq "yes" || $noforward eq "stop") && ($last_chance_back eq "yes" || $noback eq "stop")) || ($AT_rich eq "yes" && $count_seed ne "0") || ($bad_read ne "" && $count_seed ne "0")))) + { + my $start_point = '500'; + my $check_repetitive = '3'; + + while ($check_repetitive > 2) + { + my $check_for_rep = substr $seed_old{$old_id{$id}}, 0, $start_point+200; + my $repetitive = substr $check_for_rep, $start_point, 15; + $check_repetitive = $check_for_rep =~ s/$repetitive/$repetitive/g; + if ($check_repetitive > 2) + { + $start_point += 20; + if ($y > $startprint2) + { + print OUTPUT5 "DETECT_REPETITIVE_IN_START_SEQUENCE\n"; + } + } + } + $first_contig_start = substr $seed_old{$old_id{$id}}, $start_point, $overlap; + my $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//; + while ($check_start > 0) + { + $start_point += 10; + $first_contig_start = substr $seed_old{$old_id{$id}}, $start_point, $overlap; + $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//; + } + if ($y > $startprint2) + { + print OUTPUT5 $first_contig_start." 
CONTIG_START6a\n"; + } + + foreach my $seedie (keys %seed) + { + my $seedie_part = substr $seed{$seedie}, 0, 1000; + my $check_seedie = $seedie_part =~ s/$first_contig_start/$first_contig_start/; + if ($check_seedie > 0) + { + delete $seed{$seedie}; + } + } + $merge_now = "yes2"; + if ($y > $startprint2) + { + print OUTPUT5 "MERGE NOW\n"; + } + goto MERGE; + } + + elsif ($heteroplasmy eq "" && $no_next_seed ne "yes" && (($SNR_next_seed eq "yes" && ($last_chance_back eq "yes" || $noback eq "stop")) || ($nosecond eq "" && $CP_check ne "yes" && length($read) > $read_length+150 && $circle eq "" && (($last_chance eq "yes" || $noforward eq "stop") && ($last_chance_back eq "yes" || $noback eq "stop")) || ($AT_rich eq "yes" && $count_seed ne "0") || ($bad_read ne "" && $count_seed ne "0")))) + { + if ($id_original eq "") + { + $id_original = $id; + } + if ($bad_read ne "") + { + $read = $seed_old{$id_old}; + } + else + { + if ($y > $startprint2) + { + print OUTPUT5 ">".$id."\n"; + print OUTPUT5 $read."\n"; + } + my $lastbit_contig = substr $read, -20; + $lastbit_contig =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + + if (length($read) > $read_length+70 && ($lastbit_contig_prev !~ m/.*.$lastbit_contig.*/ || length($read) > $read_length+300)) + { + print OUTPUT6 ">".$id."\n"; + print OUTPUT6 $read."\n"; + $lastbit_contig_prev = substr $read, -100; + $lastbit_contig_prev =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + } + + if (exists $old_id{$id}) + { + if (exists($contigs_id{$old_id{$id}}) && length($seed_old{$old_id{$id}}) > $read_length+30) + { + } + else + { + print OUTPUT5 $seed_old{$old_id{$id}}." 
SEED_OLD_CONTIG\n"; + if ($contig_num eq '1') + { + $contigs{$contig_num."+".$id} = $seed_old{$old_id{$id}}; + my $start_point = '500'; + my $check_repetitive = '3'; + + while ($check_repetitive > 2) + { + my $check_for_rep = substr $seed_old{$old_id{$id}}, 0, $start_point+200; + my $repetitive = substr $check_for_rep, $start_point, 15; + $check_repetitive = $check_for_rep =~ s/$repetitive/$repetitive/g; + if ($check_repetitive > 2) + { + $start_point += 20; + if ($y > $startprint2) + { + print OUTPUT5 "DETECT_REPETITIVE_IN_START_SEQUENCE\n"; + } + } + } + $first_contig_start = substr $seed_old{$old_id{$id}}, $start_point, $overlap; + my $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//; + while ($check_start > 0) + { + $start_point += 10; + $first_contig_start = substr $seed_old{$old_id{$id}}, $start_point, $overlap; + $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//; + } + if ($y > $startprint2) + { + print OUTPUT5 $first_contig_start." CONTIG_START6\n"; + } + + foreach my $seedie (keys %seed) + { + my $seedie_part = substr $seed{$seedie}, 0, 1000; + my $check_seedie = $seedie_part =~ s/$first_contig_start/$first_contig_start/; + if ($check_seedie > 0) + { + delete $seed{$seedie}; + } + } + } + else + { + $contigs{$contig_num."+".$id} = $seed_old{$old_id{$id}}; + } + $contig_num++; + } + delete $old_id{$id}; + } + $seed_old{$id} = $read; + $id_old = $id; + } + + delete $seed{$id}; + if (exists($seed_old{$id})) + { + $seed_old{$id} = $read; + } + + my $SNR_skip = "yes"; + my $xy = -($overlap+3); + my $tt = '-200'; + my $second_seed = ""; + my $u = '9'; + my $skip; + while ($SNR_skip eq "yes") + { + my $new_seed_part_tmp = substr $read, -$u, 9; + $new_seed_part_tmp =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\./\./; + my $SNR_checkA = $new_seed_part_tmp =~ tr/A\./A\./; + my $SNR_checkC = $new_seed_part_tmp =~ tr/C\./C\./; + my $SNR_checkT = $new_seed_part_tmp =~ tr/T\./T\./; + my $SNR_checkG = $new_seed_part_tmp =~ tr/G\./G\./; + + if 
($SNR_checkA > 4 || $SNR_checkC > 4 || $SNR_checkG > 4 || $SNR_checkT > 4) + { + $SNR_skip = "yes"; + $u += 9; + } + elsif ($skip eq "") + { + $skip = "yes"; + $SNR_skip = "yes"; + $u += 9; + } + else + { + $SNR_skip = ""; + } + } + my $middle_next = '0'; + $u -= 9; + +NEXT1: + if ($y > $startprint2) + { + print OUTPUT5 $u." U\n"; + } + my %extensions_yuyu_next; + my %before_pair; + my $overhang = 40; + my $before_split = substr $read_short_end2, -($read_length-$left-1)-$u-$overhang, $overhang+$overlap+$u; + my $star3 = '0'; + my $next_seed; + if ($containX_short_end2 > 0) + { + my $before_split2 = substr $read_short_end2, -($read_length-$left-1)-$u-$overhang, $overhang+$overlap+$u; + $star3 = $before_split2 =~ tr/\*/\*/; + if ($star3 > 0) + { + $before_split = substr $read_short_end2, -($read_length-$left-1+($star3*2))-$u-$overhang, $overhang+$overlap+$u+($star3*2); + } + } + + if ($reference_next_seed eq "yes" && $next_seed_ref ne "") + { + $before_split = $next_seed_ref; + } + + my $s = '0'; + while ($s <= length($before_split)-$overlap) + { + my $line_tmp = substr $before_split, $s, $overlap; + if ($star3 > 0) + { + my $star = $line_tmp =~ tr/\*/\*/; + $line_tmp = substr $before_split, $s, $overlap+($star*2); + my $star2 = $line_tmp =~ tr/\*/\*/; + while ($star2 > $star) + { + $line_tmp = substr $before_split, $s, $overlap+($star*2)+(($star2-$star)*2); + $star = $star2; + $star2 = $line_tmp =~ tr/\*/\*/; + } + } + my %line_tmp = build_partial3b $line_tmp, ""; + foreach my $line (keys %line_tmp) + { + if (exists($hash2b{$line})) + { + my $search0 = $hash2b{$line}; + my $search_rev; + $search0 = substr $search0, 1; + my @search = split /,/,$search0; + + foreach my $search (@search) + { + my $search_tmp = substr $search, 0, -1; + my $search_end = substr $search, -1; + if (exists($hash{$search_tmp})) + { + my @search_tmp = split /,/,$hash{$search_tmp}; + my $found; + my $found_rev; + if ($search_end eq "1") + { + $found = $search_tmp[0]; + $found_rev = 
$search_tmp[1]; + $search_rev = $search_tmp."2"; + } + elsif ($search_end eq "2") + { + $found = $search_tmp[1]; + $found_rev = $search_tmp[0]; + $search_rev = $search_tmp."1"; + } + if ($use_quality ne "") + { + $found =~ tr/1234/ACTG/; + $found_rev =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $found = decrypt $found; + $found_rev = decrypt $found_rev; + } + my $found_new; + my $first_nuc; + + my $middle = substr $found, $left, -(15+$middle_next); + my $middle_read_end = substr $read_short_end2, -($read_length-$left-1)+$s-$u-$overhang+2, length($middle)-4; + if ($reference_next_seed eq "yes" && $next_seed_ref ne "") + { + $middle_read_end = $middle; + $middle = $next_seed_ref; + } + my $check_middle = $middle =~ s/(.)$middle_read_end/$1$middle_read_end/; + + + if ($check_middle > 0) + { + $found_rev =~ tr/ACTG/TGAC/; + my $found_rev2 = reverse($found_rev); + if ($reference_next_seed eq "yes" && $next_seed_ref ne "") + { + $found_rev2 = $found; + } + $before_pair{$found_rev2} = $search_rev; + } + if ($save_reads ne "") + { + my $add_read = $search_tmp; + if (exists($save_reads{$add_read})) + { + } + else + { + $save_reads{$add_read} = undef; + if ($save_reads eq "2") + { + my $add_read2 = $map_ids{$add_read}; + print OUTPUT10 $add_read2."\/1\n"; + print OUTPUT11 $add_read2."\/2\n"; + } + else + { + print OUTPUT10 ">".$add_read."\/1\n"; + print OUTPUT11 ">".$add_read."\/2\n"; + } + if (exists($hash{$add_read})) + { + my @add_read = split /,/,$hash{$add_read}; + my $forward = $add_read[0]; + my $reverse = $add_read[1]; + if ($use_quality ne "") + { + $forward =~tr/1234/ACTG/; + $reverse =~tr/1234/ACTG/; + } + print OUTPUT10 $forward."\n"; + print OUTPUT11 $reverse."\n"; + } + } + } + } + } + } + } + $s++; + } + if (%before_pair < 1 && $middle_next eq '0') + { + $middle_next = $read_length/4; + goto NEXT1; + } + elsif (%before_pair < 1 && $middle_next < ($read_length/4)+3) + { + $middle_next = ($read_length/4)+6; + $u += 20; + goto NEXT1; + } + elsif 
(%before_pair < 1 && $reference ne "" && $reference_next_seed eq "") + { + $reference_next_seed{$id} = "yes"; + delete $noforward{$id}; + $seed_id = $id; + $seed{$id} = $read; + goto SEED; + } + + my $id_tmp; + my %before_pair_final; + foreach my $before_pair (keys %before_pair) + { + $id_tmp = $before_pair{$before_pair}; + my $next_seed0 = correct ($before_pair, \%before_pair, $heteroplasmy); + my $check_dot = $next_seed0 =~ tr/\./\./; + + my $read_end_test2 = substr $read_short_end2, -30, 13; + my $read_end_test3 = substr $read_short_end2, -60, 13; + my $read_end_test4 = substr $read_short_end2, -100, 13; + my $read_end_test5 = substr $read_short_end2, -150, 13; + my $read_end_test6 = substr $read_short_end2, -200, 13; + + my $next_seed_tmp = substr $next_seed0, 25; + my $check_test2 = $next_seed_tmp =~ s/(.)$read_end_test2/$1.$read_end_test2/; + my $check_test3 = $next_seed_tmp =~ s/(.)$read_end_test3/$1.$read_end_test3/; + my $check_test4 = $next_seed_tmp =~ s/(.)$read_end_test4/$1.$read_end_test4/; + my $check_test5 = $next_seed_tmp =~ s/(.)$read_end_test5/$1.$read_end_test5/; + my $check_test6 = $next_seed_tmp =~ s/(.)$read_end_test6/$1.$read_end_test6/; + + if (($check_dot < (3 +($read_length/60)) || $check_dot eq "") && $next_seed0 ne "" && $check_test2 eq "" && $check_test3 eq "" && $check_test4 eq "" && $check_test5 eq "" && $check_test6 eq "") + { + print OUTPUT5 $next_seed0." 
FOUND_SEED\n"; + $before_pair_final{$next_seed0} = $id_tmp; + } + } + my $count_match = '0'; + my $count_match_tmp = '0'; + foreach my $before_pair_final (keys %before_pair_final) + { + my $pair_test1 = substr $before_pair_final, 17, 30; + my $pair_test2 = substr $before_pair_final, -47, 30; + foreach my $before_pair_final_tmp (keys %before_pair_final) + { + my $check_test1 = $before_pair_final_tmp =~ s/(.)$pair_test1/$1.$pair_test1/; + my $check_test2 = $before_pair_final_tmp =~ s/(.)$pair_test2/$1.$pair_test2/; + if ($check_test1 > 0 || $check_test2 > 0) + { + $count_match_tmp++; + } + } + if ($count_match_tmp > $count_match) + { + $count_match = $count_match_tmp; + $next_seed = $before_pair_final; + $id_tmp = $before_pair_final{$before_pair_final}; + } + $count_match_tmp = '0'; + $id = $before_pair_final{$before_pair_final}; + } + + if ($next_seed eq "" && $middle_next eq '0') + { + $middle_next = $read_length/4; + goto NEXT1; + } + elsif ($next_seed eq "" && $middle_next < ($read_length/4)+3) + { + $middle_next = ($read_length/4)+6; + $u += 20; + goto NEXT1; + } + elsif ($next_seed eq "" && $reference ne "" && $reference_next_seed eq "") + { + $reference_next_seed{$id} = "yes"; + delete $noforward{$id}; + $seed_id = $id; + $seed{$id} = $read; + goto SEED; + } + + if ($y > $startprint2) + { + print OUTPUT5 $next_seed." 
NEXT_SEED\n"; + } + + if ($next_seed ne "") + { + delete $seed{$id_original}; + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + + $id = $id_tmp; + + + $seed{$id} = $next_seed; + $seed_id = $id; + $seeds_check{$id} = undef; + $position{$id} = length($next_seed); + $old_id{$id} = $id_old; + $old_id2{$id} = undef; + + $insert_size2{$id} = $insert_size; + + my $count_contig_tmp = $contig_count; + while ($count_contig_tmp > 0) + { + $contig_gap_min{$id."_".$count_contig_tmp} = $contig_gap_min{$id_old."_".$count_contig_tmp}; + $contig_gap_max{$id."_".$count_contig_tmp} = $contig_gap_max{$id_old."_".$count_contig_tmp}; + $count_contig_tmp--; + } + + $contig_count++; + $contig_count{$id} = $contig_count; + goto ITERATION; + } + + +NEXT_SEED: while ($xy > $tt) + { + my $new_seed_part = substr $read, $xy-$u+10, $overlap; + + if (exists($hash2c{$new_seed_part})) + { + my $id_tmp = $hash2c{$new_seed_part}; + my $id_tmp2 = substr $id_tmp, 1; + my @id_tmp = split /,/,$id_tmp2; + + foreach my $id_tmp3 (@id_tmp) + { + chomp ($id_tmp3); + my $id_match_end = substr $id_tmp3, -1, 1,"",; + my $seed_tmp; + + if (exists($hash{$id_tmp3}) && $id_tmp3) + { + my @id_tmp3 = split /,/, $hash{$id_tmp3}; + if ($id_match_end eq "1") + { + $seed_tmp = $id_tmp3[1]; + } + elsif ($id_match_end eq "2") + { + $seed_tmp = $id_tmp3[0]; + } + if ($use_quality ne "") + { + $seed_tmp =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $seed_tmp = decrypt $seed_tmp; + } + + $seed_tmp =~ tr/ACTG/TGAC/; + my $seed_tmp2 = reverse($seed_tmp); + my $yx = '0'; + while ($yx < length($seed_tmp2)-($overlap+1)) + { + my $seed_tmp2_part = substr $seed_tmp2, $yx, $overlap; + if (exists($hash2b{$seed_tmp2_part})) + { + my $id2 = $hash2b{$seed_tmp2_part}; + my $id5 = substr $id2, 1; + my @id_tmp2 = split /,/,$id5; + my $id_b = $id_tmp2[0]; + my $id_tmp2_end = substr $id_b, -1, 1,"",; + + if ($second_seed eq "ddd") + { + my @id_b = split /,/, $hash{$id_b}; + if 
($id_tmp2_end eq "1") + { + $read = $id_b[0]; + } + elsif ($id_tmp2_end eq "2") + { + $read = $id_b[1]; + } + if ($use_quality ne "") + { + $read =~ tr/1234/ACTG/; + } + $second_seed = "yes"; + if ($encrypt eq "yes") + { + $read = decrypt $read; + } + + $tt = -length($read); + goto NEXT_SEED; + } + if ($y > $startprint2) + { + print OUTPUT5 $id_b." ID_B!!!!\n"; + } + if (exists($id_bad{$id_b})) + { + goto SAME_ID; + } + elsif (exists($hash{$id_b})) + { + my @id_b = split /,/, $hash{$id_b}; + if ($id_tmp2_end eq "1") + { + $seed = $id_b[0]; + } + elsif ($id_tmp2_end eq "2") + { + $seed = $id_b[1]; + } + if ($use_quality ne "") + { + $seed =~ tr/1234/ACTG/; + } + if ($encrypt eq "yes") + { + $seed = decrypt $seed; + } + my $part2 = substr $seed, -$overlap-15; + my $s = '0'; + my $most_match_total = '0'; + while ($s < length($part2)-$overlap) + { + my $part2b = substr $part2, $s, $overlap; + my $part2b_reverse = reverse($part2b); + $part2b_reverse =~ tr/ACTG/TGAC/; + if (exists($hash2b{$part2b})) + { + $most_match_total++; + } + if (exists($hash2c{$part2b})) + { + $most_match_total++; + } + if (exists($hash2b{$part2b_reverse})) + { + $most_match_total++; + } + if (exists($hash2c{$part2b_reverse})) + { + $most_match_total++; + } + $s++; + } + if ($most_match_total > 2) + { + my %empty_hash; + $empty_hash{$seed} = undef; + $seed = correct ($seed, \%empty_hash, $heteroplasmy); + my $f = '0'; + my $read_part = substr $read, -$insert_size*$insert_range; + while ($f < length($seed)-30) + { + my $middle1 = substr $seed, $f, 30; + my $check_read1 = $read_part =~ s/$middle1/$middle1/; + if ($check_read1 > 0) + { + $xy--; + $id_bad{$id_b} = undef; + goto NEXT_SEED; + } + $f += 10; + } + + my $seed_test = substr $seed, -$overlap-30; + my $A_rich_test3 = $seed_test =~ tr/A/A/; + my $T_rich_test3 = $seed_test =~ tr/T/T/; + my $G_rich_test3 = $seed_test =~ tr/G/G/; + my $C_rich_test3 = $seed_test =~ tr/C/C/; + my $dot_rich_test3 = $seed_test =~ tr/\./\./; + if 
($A_rich_test3+$dot_rich_test3 > length($seed_test)-5 || $T_rich_test3+$dot_rich_test3 > length($seed_test)-5 || $G_rich_test3+$dot_rich_test3 > length($seed_test)-5 || $C_rich_test3+$dot_rich_test3 > length($seed_test)-5) + { + $xy--; + $id_bad{$id_b} = undef; + goto NEXT_SEED; + } + } + else + { + $xy--; + $id_bad{$id_b} = undef; + goto NEXT_SEED; + } + } + else + { + $id_bad{$id_b} = undef; + goto SAME_ID; + } + if ($y > $startprint2) + { + print OUTPUT5 $id_b." SECOND!!!!\n"; + print OUTPUT5 $seed." SEED!!!!\n"; + } + delete $seed{$id_original}; + delete $seed{$id}; + delete $seed{$id_split1}; + delete $seed{$id_split2}; + delete $seed{$id_split3}; + + + $id_bad{$id_b} = undef; + + $seed{$id_b} = $seed; + $seeds_check{$id_b} = undef; + $position{$id_b} = length($seed); + $old_id{$id_b} = $id_old; + $old_id2{$id_b} = undef; + + $insert_size2{$id_b} = $insert_size; + + my $count_contig_tmp = $contig_count; + while ($count_contig_tmp > 0) + { + $contig_gap_min{$id_b."_".$count_contig_tmp} = $contig_gap_min{$id_old."_".$count_contig_tmp}; + $contig_gap_max{$id_b."_".$count_contig_tmp} = $contig_gap_max{$id_old."_".$count_contig_tmp}; + $count_contig_tmp--; + } + + $contig_count++; + $contig_count{$id_b} = $contig_count; + my $gap_min = ($insert_size*0.62)-(2*$read_length)+$xy + $overlap-16 + $yx; + my $gap_max= ($insert_size*1.38)-(2*$read_length)+$xy + $overlap-16 + $yx; + $contig_gap_min{$id_b."_".$contig_count} = $gap_min; + $contig_gap_max{$id_b."_".$contig_count} = $gap_max; + goto ITERATION; +SAME_ID: + } + $yx++; + } + } + } + } + $xy--; + } + $id = $id_original; + $no_next_seed = "yes"; + $no_next_seed{$id} = "yes"; + goto FINISH; + } + elsif($circle eq "" && (($last_chance eq "yes" || $noforward eq "stop") && ($last_chance_back eq "yes" || $noback eq "stop")) || ($AT_rich eq "yes" && $count_seed ne "0") || ($bad_read ne "" && $count_seed ne "0")) + { + delete $seed{$id}; + delete $last_chance{$id}; + if ($y > $startprint2) + { + print OUTPUT5 "DELETE READS 
AND SEED\n";
+                print OUTPUT5 ">".$id."\n";
+                print OUTPUT5 $read."\n";
+            }
+            # If %old_id records a predecessor for this seed, the predecessor's saved
+            # sequence in %seed_old is stored as a contig when it is longer than 250 bp.
+            if (exists($old_id{$id}))
+            {
+                my $read_tmp = $seed_old{$old_id{$id}};
+                if (length($read_tmp) > 250)
+                {
+                    $contigs{$contig_num."+".$old_id{$id}} = $read_tmp;
+                    $contig_num++;
+                }
+            }
+            else
+            {
+                # No predecessor: the current read itself becomes a contig, keyed by
+                # the trailing number of its id.
+                if (length($read) > 250)
+                {
+                    if ($id =~ m/(.*_|)(\d+)$/)
+                    {
+                        my $check_id = $2;
+                        if ($contig_num eq '1')
+                        {
+                            $contigs{$contig_num."+".$check_id} = $read;
+                            my $start_point = '500';
+                            my $check_repetitive = '3';
+
+                            # Count how often the 15-mer at $start_point occurs in the first
+                            # $start_point+200 bases (s///g with an identical replacement only
+                            # counts the matches). More than two hits: move 20 bases further
+                            # and test again.
+                            while ($check_repetitive > 2)
+                            {
+                                my $check_for_rep = substr $read, 0, $start_point+200;
+                                my $repetitive = substr $check_for_rep, $start_point, 15;
+                                $check_repetitive = $check_for_rep =~ s/$repetitive/$repetitive/g;
+                                # Shift only when the 15-mer really is repetitive, as the same
+                                # scan in the next-seed branch does; without this test the start
+                                # point moved 20 bases and the message was logged on every pass.
+                                if ($check_repetitive > 2)
+                                {
+                                    $start_point += 20;
+                                    print OUTPUT5 "DETECT_REPETITIVE_IN_START_SEQUENCE\n";
+                                }
+                            }
+                            $first_contig_start = substr $read, $start_point, $overlap;
+                            # tr with an empty replacement list counts the listed characters;
+                            # slide in steps of 10 until the window contains none of them.
+                            my $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;
+                            while ($check_start > 0)
+                            {
+                                $start_point += 10;
+                                $first_contig_start = substr $read, $start_point, $overlap;
+                                $check_start = $first_contig_start =~ tr/N|K|R|Y|S|W|M|B|D|H|V|\.//;
+                            }
+                            print OUTPUT5 $first_contig_start." 
CONTIG_START7\n"; + foreach my $seedie (keys %seed) + { + my $seedie_part = substr $seed{$seedie}, 0, 1000; + my $check_seedie = $seedie_part =~ s/$first_contig_start/$first_contig_start/; + if ($check_seedie > 0) + { + delete $seed{$seedie}; + } + } + } + else + { + $contigs{$contig_num."+".$check_id} = $read; + } + $contig_num++; + } + } + } + } + elsif($circle eq "yes" && $heteroplasmy eq "") + { + my $output_file = "Circularized_assembly_".$option."_".$project.".fasta"; + open(OUTPUT, ">" .$output_file) or die "Can't open file $output_file, $!\n"; + + delete $seed{$id}; + $read =~ tr/\./N/; + $read =~ tr/X//d; + my @contigs = split /L+/, $read; + my $l = '0'; + my $largest_contig = '0'; + my $miminum_contig = '100000000000000000000000000000'; + + print "\b" x length($progress_before); + print ' ' x length($progress_before); + print "\b" x length($progress_before); + print "\n-----------------Assembly ".$option." finished successfully: The genome has been circularized-----------------\n\n"; + print OUTPUT4 "\n-----------------Assembly ".$option." finished successfully: The genome has been circularized-----------------\n\n"; + print OUTPUT5 "\n-----------------Assembly ".$option." finished successfully: The genome has been circularized-----------------\n\n"; + $assembly_length = '1'; + foreach (@contigs) + { + $l++; + my $fin = $_; + my $fin2 = $fin; + $fin =~ s/(.{1,150})/$1\n/gs; + + if ($l > 1) + { + my $gap_min = sprintf("%.0f", $contig_gap_min{$id."_".($l-1)}); + my $gap_max = sprintf("%.0f", $contig_gap_max{$id."_".($l-1)}); + print "Estimated gap : ".$gap_min." bp to ".$gap_max." bp"; + print OUTPUT4 "Estimated gap : ".$gap_min." bp to ".$gap_max." 
bp"; + if ($contig_gap_min{$id."_".($l-1)} < 0) + { + print " (Check manually if the two contigs overlap to merge them together!)\n"; + print OUTPUT4 " (Check manually if the two contigs overlap to merge them together!)\n"; + } + else + { + print "\n"; + print OUTPUT4 "\n"; + } + } + print OUTPUT ">Contig".$l."\n"; + print OUTPUT $fin; + if (length($fin2) > $largest_contig) + { + $largest_contig = length($fin2); + } + if (length($fin2) < $miminum_contig) + { + $miminum_contig = length($fin2); + } + print "Contig ".$l." : ".length($fin2)." bp\n"; + print OUTPUT4 "Contig ".$l." : ".length($fin2)." bp\n"; + $assembly_length += length($fin2); + } + $assembly_success = "yes"; + if ($y > $startprint2) + { + print OUTPUT5 ">".$id."\n"; + print OUTPUT5 $read."\n\n\n"; + } + + print "\nTotal contigs : ".$l."\n"; + print "Largest contig : ".$largest_contig." bp\n"; + print "Smallest contig : ".$miminum_contig." bp\n"; + print "Average insert size : ".$insert_size." bp\n\n"; + print OUTPUT4 "\nTotal contigs : ".$l."\n"; + print OUTPUT4 "Largest contig : ".$largest_contig." bp\n"; + print OUTPUT4 "Smallest contig : ".$miminum_contig." bp\n"; + print OUTPUT4 "Average insert size : ".$insert_size." 
bp\n\n"; + $option++; + close OUTPUT; + $finish = "yes"; + } + elsif($circle eq "yes" && $heteroplasmy ne "") + { + $finish = "yes"; + delete $seed{$id}; + } + elsif ($circle ne "contigs" && $heteroplasmy eq "") + { + my $output_file2 = "Uncircularized_assemblies_".$option."_".$project.".fasta"; + open(OUTPUT2, ">" .$output_file2) or die "Can't open file $output_file2, $!\n"; + + if (length($read) > 250) + { + $contigs{$contig_num."+".$id} = $read; + $contig_num++; + } + + delete $seed{$id}; + + $read =~ tr/\./N/; + $read =~ tr/X//d; + my @contigs = split /L+/, $read; + my $l = '0'; + my $largest_contig = '0'; + my $miminum_contig = '100000000000000000000000000000'; + + print "\b" x length($progress_before); + print ' ' x length($progress_before); + print "\b" x length($progress_before); + print "\n-----------------Assembly ".$option." finished incomplete: The genome has not been circularized-----------------\n\n"; + print OUTPUT4 "\n-----------------Assembly ".$option." finished incomplete: The genome has not been circularized-----------------\n\n"; + print OUTPUT5 "\n-----------------Assembly ".$option." 
finished incomplete: The genome has not been circularized-----------------\n\n"; + + if ($y > $startprint2) + { + print OUTPUT5 ">".$id."\n"; + print OUTPUT5 $read."\n\n\n"; + } + + foreach my $contig_tmp (keys %contigs) + { + if ($contig_tmp =~ m/(\d+)\+*\d*/) + { + my $contig_tmp3 = $1; + if ($contig_tmp3 < 10) + { + $contigs{"0".$contig_tmp} = $contigs{$contig_tmp}; + delete $contigs{$contig_tmp}; + } + } + } + foreach my $contig_tmp (sort keys %contigs) + { + my $contig_tmp2; + if ($contig_tmp =~ m/(\d+)\+*\d*/) + { + $contig_tmp2 = $1; + } + $read = $contigs{$contig_tmp}; + $read =~ tr/\./N/; + $read =~ tr/X//d; + my @contigs = split /L+/, $read; + my $jj = '0'; + foreach (@contigs) + { + $l++; + $jj++; + my $fin = $_; + my $fin2 = $fin; + $fin =~ s/(.{1,150})/$1\n/gs; + + if ($jj > 1) + { + my $gap_min = sprintf("%.0f", $contig_gap_min{$id."_".($jj-1)}); + my $gap_max = sprintf("%.0f", $contig_gap_max{$id."_".($jj-1)}); + print "Estimated gap : ".$gap_min." bp to ".$gap_max." bp"; + print OUTPUT4 "Estimated gap : ".$gap_min." bp to ".$gap_max." bp"; + if ($contig_gap_min{$id."_".($jj-1)} < 0) + { + print " (Check manually if the two contigs overlap to merge them together!)\n"; + print OUTPUT4 " (Check manually if the two contigs overlap to merge them together!)\n"; + } + else + { + print "\n"; + print OUTPUT4 "\n"; + } + } + print OUTPUT2 ">Contig".$contig_tmp."\n"; + print OUTPUT2 $fin; + if (length($fin2) > $largest_contig) + { + $largest_contig = length($fin2); + } + if (length($fin2) < $miminum_contig) + { + $miminum_contig = length($fin2); + } + print "Contig ".$contig_tmp2." : ".length($fin2)." bp\n"; + print OUTPUT4 "Contig ".$contig_tmp2." : ".length($fin2)." bp\n"; + } + if ($y > $startprint2) + { + print OUTPUT5 ">Contig".$contig_tmp."\n"; + print OUTPUT5 $read."\n"; + } + } + print "\nTotal contigs : ".$l."\n"; + print "Largest contig : ".$largest_contig." bp\n"; + print "Smallest contig : ".$miminum_contig." 
bp\n"; + print "Average insert size : ".$insert_size." bp\n\n"; + print OUTPUT4 "\nTotal contigs : ".$l."\n"; + print OUTPUT4 "Largest contig : ".$largest_contig." bp\n"; + print OUTPUT4 "Smallest contig : ".$miminum_contig." bp\n"; + print OUTPUT4 "Average insert size : ".$insert_size." bp\n\n"; + $option++; + close OUTPUT2; + $finish = "yes"; + } + elsif($circle ne "contigs" && $heteroplasmy ne "") + { + $finish = "yes"; + delete $seed{$id}; + } + } + if ($y > $startprint2 && $benchmark_time eq "yes") + { + if (time-$time_back > 1) + { + print OUTPUT5 time-$time_back." TIME_END\n"; + } + } +} +$y++; + + if ($y eq $iterations) + { + foreach my $seedprint (keys %seed) + { + print OUTPUT5 "\n".$seedprint."\n"; + print OUTPUT5 $seed{$seedprint}."\n\n"; + } + } + if (@insert_size > 500 && (@insert_size < 8000 || $y < 10) && $insert_size_correct eq "yes" && $paired eq "PE") + { + my $insert_total = '0'; + my $k = '0'; + foreach my $insert1 (@insert_size) + { + $insert_total += $insert1; + $k++; + } + + my $insert_size_temp = $insert_total/$k; + $insert_size = int($insert_size_temp + $insert_size_temp/abs($insert_size_temp*2)); + if ($y > $startprint2) + { + print OUTPUT5 $insert_size." 
Insert Size\n"; + } + } + elsif (@insert_size >= 8000) + { + $insert_size_correct = ""; + $insert_range = $insert_range_b; + $insert_range_back = $insert_range_b; + } + elsif ($insert_size_correct ne "yes" && length($read) > $insert_size) + { + $insert_range = $insert_range_b; + $insert_range_back = $insert_range_b; + } +} +FINISH2: + + if ($finish ne "yes") + { + foreach my $seed_id_tmp (keys %seed) + { + if (length($seed{$seed_id_tmp}) > 250) + { + $contigs{$contig_num."+".$seed_id_tmp} = $seed{$seed_id_tmp}; + $contig_num++; + } + delete $seed{$seed_id_tmp}; + } + + foreach my $contig_tmp (keys %contigs) + { + if ($contig_tmp =~ m/(\d+)\+*\d*/) + { + my $contig_tmp3 = $1; + if ($contig_tmp3 < 10) + { + $contigs{"0".$contig_tmp} = $contigs{$contig_tmp}; + delete $contigs{$contig_tmp}; + } + } + } + } + my $count_contigs = keys %contigs; + + my $tree_succes = ""; + if($circle ne "yes" && $count_contigs > 1 && $heteroplasmy eq "") + { + open(OUTPUT7, ">" .$output_file7) or die "\nCan't open file $output_file7, $!\n"; + my $h = '0'; + my $terminate = '0'; + my %node; + my %row_nodes; + my %contig_num; + my %contigs2; + undef %contigs2; + my %repetitive; + undef %repetitive; + foreach my $contig_tmp (keys %contigs) + { + if ($contig_tmp =~ m/(\d+)\+*(.*)/) + { + my $contig_num = $1; + my $contig_code = $2; + if ($contig_code =~ m/.*_(\d+)$/) + { + $contig_code = $1; + } + $contig_num{$contig_code} = $contig_num; + $contigs2{$contig_num} = $contigs{$contig_tmp}; + } + } + print OUTPUT7 "LINKS BETWEEN CONTIGS\n"; + print OUTPUT7 "---------------------\n\n"; + foreach my $tree (keys %tree) + { + my $tree2 = $tree; + my $tree3 = $tree{$tree2}; + if ($tree2 =~ m/.*_(\d+(REP)*)$/) + { + $tree = $1; + } + if ($tree3 =~ m/.*_(\d+(REP))$/) + { + $tree3 = $1; + } + delete $tree{$tree2}; + $tree{$tree} = $tree3; + my $tree_tmp = $tree; + my $tree_tmp2 = $tree{$tree}; + + my @ids_split = split /\*/, $tree_tmp2; + foreach my $id_split (@ids_split) + { + foreach my $contig_num 
(keys %contig_num) + { + if ($id_split =~ m/^$contig_num(REP)*$/) + { + if ($tree_tmp2 =~ m/^(.*\*)*$contig_num(REP)*(\*.*)*$/) + { + if (defined($1)) + { + $tree_tmp2 = $1.$contig_num{$contig_num}; + } + else + { + $tree_tmp2 = $contig_num{$contig_num}; + } + if (defined($2)) + { + $tree_tmp2 = $tree_tmp2."REP"; + } + if (defined($3)) + { + $tree_tmp2 = $tree_tmp2.$3; + } + } + } + if ($contig_num eq $tree_tmp) + { + $tree_tmp = $contig_num{$contig_num}; + } + } + } + my $rep_test = substr $tree_tmp2, -3; + $tree_tmp2 =~ s/\*/ OR /g; + if ($rep_test eq "REP") + { + my $tree_tmp2a = substr $tree_tmp2,0, -3; + print OUTPUT7 $tree_tmp."----> REPETITIVE REGION----> ".$tree_tmp2a."\n"; + } + else + { + print OUTPUT7 $tree_tmp."----> ".$tree_tmp2."\n"; + } + } + if (exists($tree{"START"})) + { + $row{$h} = "01"; + $node{$h} = $tree{"START"}; + } + if ($hasL eq "yes") + { + print OUTPUT7 "\n(Contigs broken up by long homopolymer stretches are linked together as one contig with 15 N's, they can be merged manually in some cases)\n"; + print OUTPUT7 "\n(If the region before and after the N zone overlaps, you can merge them by deleting the regions next to N zone from both sides (those are the least reliable))\n\n"; + + } + my %row_circle; + undef %row_circle; +TERMINATE: while (keys %node) + { + foreach my $h1 (keys %node) + { + undef %row_nodes; + %row_nodes = map { $_ => undef } split(/\+/, $row{$h1}); + my @row_nodes = map { $_ => undef } split(/\+/, $row{$h1}); + if(exists($tree{$node{$h1}})) + { + if ($tree{$node{$h1}} eq $node{$h1}) + { + delete $node{$h1}; + delete $row{$h1}; + } + elsif ($tree{$node{$h1}} eq "END_REVERSE") + { + delete $node{$h1}; + delete $row{$h1}; + } + elsif ($tree{$node{$h1}} eq "END") + { + delete $node{$h1}; + $row_circle{$h1} = $row{$h1}; + } + elsif ($tree{$node{$h1}} =~ m/(.*)\*(.*)/) + { + my @id_node = split /\*/, $tree{$node{$h1}}; + foreach my $id_node (@id_node) + { + my $count1 = '0'; + foreach my $row_n (@row_nodes) + { + if 
($id_node eq $row_n) + { + $count1++; + } + } + if ($id_node eq $node{$h1}) + { + } + elsif ($count1 > 0) + { + } + else + { + $h++; + $node{$h} = $id_node; + $row{$h} = $row{$h1}."+".$id_node; + } + } + delete $node{$h1}; + delete $row{$h1}; + } + elsif ($tree{$node{$h1}} =~ m/(.*)REP/) + { + $h++; + $row{$h} = $row{$h1}."+".$1."R"; + $node{$h} = $1; + delete $node{$h1}; + delete $row{$h1}; + $repetitive{$1} = undef; + } + else + { + $h++; + if ($row{$h1} =~ m/^(.*)\+\d+$/) + { + $row{$h1} = $1; + } + if ($row{$h1} =~ m/^01$/) + { + $row{$h} = $row{$h1} + } + else + { + $row{$h} = $row{$h1}."+".$tree{$node{$h1}}; + } + $node{$h} = $tree{$node{$h1}}; + delete $node{$h1}; + delete $row{$h1}; + } + } + else + { + delete $node{$h1}; + delete $row{$h1}; + } + $terminate++; + if ($terminate > 1500) + { + delete $row{$h1}; + last TERMINATE; + } + } + } + my $g = '1'; + + foreach my $row (keys %row_circle) + { + foreach my $contig_num (keys %contig_num) + { + $row{$row} =~ s/\+$contig_num\+/\+$contig_num{$contig_num}\+/g; + $row{$row} =~ s/\+$contig_num$/\+$contig_num{$contig_num}/g; + $row{$row} =~ s/\+$contig_num(R)\+/\+$contig_num{$contig_num}R\+/g; + $row{$row} =~ s/\+$contig_num(R)$/\+$contig_num{$contig_num}R/g; + } + + my @row = split /\+/,$row{$row}; + my $assembly = ""; + + foreach my $cont (@row) + { + my $check = $cont =~ tr/R//d; + if (exists($contigs2{$cont})) + { + my $repe2 = substr $assembly, -1; + if ($cont eq '1' || $cont eq '01') + { + $assembly = $contigs2{$cont}; + } + elsif ($check > 0) + { + $assembly .= "RRRRRRRRRRRRRR".$contigs2{$cont}; + } + else + { + my $end_assembly = substr $assembly, -30; + my $end_assembly2 = substr $assembly, -80 , 30; + $end_assembly =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + $end_assembly2 =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $start_next_contig = substr $contigs2{$cont},0 ,500; + $start_next_contig =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $start_next_contig2 = $start_next_contig; + my $x1 = length($start_next_contig); + 
$start_next_contig =~ s/.*.$end_assembly//; + my $x2 = length($start_next_contig); + $start_next_contig2 =~ s/.*.$end_assembly2//; + my $x2b = length($start_next_contig2); + if ($x1-$x2 ne '0') + { + my $assembly_tmp = substr $assembly, 0, -($x1-$x2); + $assembly = $assembly_tmp.$contigs2{$cont}; + } + elsif ($x1-$x2b ne '0') + { + my $assembly_tmp = substr $assembly, 0, -($x1-$x2b+50); + $assembly = $assembly_tmp.$contigs2{$cont}; + } + else + { + $assembly .= "2RRRRRRRRRRRRRR".$contigs2{$cont}; + } + } + } + } + my $start_assembly = substr $assembly, 50, 30; + $start_assembly =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $end_last_contig = substr $assembly, -1000; + $end_last_contig =~ tr/N|K|R|Y|S|W|M|B|D|H|V/\./; + my $x1 = length($end_last_contig); + $end_last_contig =~ s/(.)$start_assembly.*/$1/; + my $x2 = length($end_last_contig); + my $assembly2; + if (-$x1+$x2 ne '0') + { + $assembly2 = substr $assembly, 50, -$x1+$x2; + } + else + { + $assembly2 = $assembly; + } + $assembly = $assembly2; + $assembly =~ tr/\./N/; + + if (length($assembly) > 500) + { + print OUTPUT7 "\n\nOPTION ".$g."\n-------------------\n"; + print OUTPUT7 "Contig Arrangement : "; + print OUTPUT7 $row{$row}."\n"; + print OUTPUT7 "Assembly length : ".length($assembly)." 
bp\n\n"; + my @contigs0 = split /RRRRRRRRRRRRRR/, $assembly; + my $size_c = @contigs0; + my $jj = '0'; + my $t ='0'; + my @order = split /\+/, $row{$row}; + foreach (@contigs0) + { + $jj++; + my $fin = $_; + $fin =~ tr/L/N/; + my $order = ""; + my $h = '0'; +ORDER: foreach my $tmp (@order) + { + if ($h < $t) + { + $h++; + next ORDER; + } + my $R = $tmp =~ s/RRRRRRRRRRRRRR//g; + if ($R ne '1'&& $h > 0) + { + $order .= "+".$tmp; + } + elsif ($R ne '1' && $h eq '0') + { + $order = $tmp; + } + if ($R > 0 && $h ne '0') + { + last ORDER; + } + $t++; + $h++; + } + my $gg = substr $order, 0, 1; + if ($gg eq "+") + { + substr $order, 0, 1, ""; + } + + print OUTPUT7 ">Contig ".$order."\n"; + my $fail = substr $fin, -1; + if ($fail eq '2') + { + substr $fin, -1, 1, ""; + } + print OUTPUT7 $fin."\n"; + + if ($size_c > 1 && $jj < $size_c && $fail ne '2') + { + print OUTPUT7 "\n-------Repetitive region detected, exact length unknown-------\n\n"; + } + elsif ($size_c > 1 && $jj < $size_c && $fail eq '2') + { + print OUTPUT7 "\n-------Couldn't merge automatically, try manually-------\n\n"; + } + } + + print OUTPUT7 "\n"; + + my $output_file9 = "Option_".$g."_".$project.".fasta"; + open(OUTPUT9, ">" .$output_file9) or die "Can't open file $output_file9, $!\n"; + $assembly =~ tr/L/N/; + my @contigs2 = split /RRRRRRRRRRRRRR/, $assembly; + my $ww = '0'; + foreach (@contigs2) + { + $ww++; + my $fin = $_; + my $fin2 = $fin; + $fin =~ s/(.{1,150})/$1\n/gs; + + print OUTPUT9 ">Contig".$ww."\n"; + print OUTPUT9 $fin; + } + close OUTPUT9; + $g++; + + if (length($assembly) > $genome_range_low && length($assembly) < $genome_range_high) + { + $tree_succes = "yes"; + $assembly_length = length($assembly); + } + } + } + print OUTPUT7 "\n\nEach option has a separate fasta file\n"; + } + elsif ($count_contigs eq 1) + { + foreach my $contig_tmp1 (keys %contigs) + { + $assembly_length = length($contigs{$contig_tmp1}); + } + } + if ($finish ne "yes" && $heteroplasmy eq "") + { + my $output_file2 = 
"Contigs_".$option."_".$project.".fasta"; + open(OUTPUT2, ">" .$output_file2) or die "Can't open file $output_file2, $!\n"; + + my $l = '0'; + my $largest_contig = '0'; + my $miminum_contig = '1000000000000000000000000000000000'; + + print "\b" x length($progress_before); + print ' ' x length($progress_before); + print "\b" x length($progress_before); + print "\n------------Assembly ".$option." finished: Contigs are automatically merged in Merged_contigs file------------\n\n"; + print OUTPUT4 "\n------------Assembly ".$option." finished: Contigs are automatically merged in Merged_contigs file------------\n\n"; + print OUTPUT5 "\n------------Assembly ".$option." finished: Contigs are automatically merged in Merged_contigs file------------\n\n"; + + + foreach my $contig_tmp (sort keys %contigs) + { + my $contig_tmp2; + if ($contig_tmp =~ m/(\d+)\+*\d*/) + { + $contig_tmp2 = $1; + } + + $read = $contigs{$contig_tmp}; + $read =~ tr/\./N/; + $read =~ tr/X//d; + my @contigs = split /L+/, $read; + my $jj = '0'; + + foreach (@contigs) + { + $l++; + $jj++; + my $fin = $_; + my $fin2 = $fin; + $fin =~ s/(.{1,150})/$1\n/gs; + + if ($jj > 1) + { + my $gap_min = sprintf("%.0f", $contig_gap_min{$id."_".($jj-1)}); + my $gap_max = sprintf("%.0f", $contig_gap_max{$id."_".($jj-1)}); + print "Estimated gap : ".$gap_min." bp to ".$gap_max." bp"; + print OUTPUT4 "Estimated gap : ".$gap_min." bp to ".$gap_max." bp"; + if ($contig_gap_min{$id."_".($jj-1)} < 0) + { + print " (Check manually if the two contigs overlap to merge them together!)\n"; + print OUTPUT4 " (Check manually if the two contigs overlap to merge them together!)\n"; + } + else + { + print "\n"; + print OUTPUT4 "\n"; + } + } + print OUTPUT2 ">Contig".$contig_tmp."\n"; + print OUTPUT2 $fin; + if (length($fin2) > $largest_contig) + { + $largest_contig = length($fin2); + } + if (length($fin2) < $miminum_contig) + { + $miminum_contig = length($fin2); + } + print "Contig ".$contig_tmp2." : ".length($fin2)." 
bp\n";
+                print OUTPUT4 "Contig ".$contig_tmp2." : ".length($fin2)." bp\n";
+            }
+            # Extra copy of the normalised sequence in OUTPUT5 once $y has passed $startprint2.
+            if ($y > $startprint2)
+            {
+                print OUTPUT5 ">Contig".$contig_tmp."\n";
+                print OUTPUT5 $read."\n";
+            }
+        }
+
+        # With no contig piece written the sentinel would be reported as the
+        # smallest contig size; report 0 instead.
+        if ($l == 0)
+        {
+            $miminum_contig = '0';
+        }
+
+        print "\nTotal contigs : ".$l."\n";
+        print "Largest contig : ".$largest_contig." bp\n";
+        print "Smallest contig : ".$miminum_contig." bp\n";
+        print "Average insert size : ".$insert_size." bp\n\n";
+        print OUTPUT4 "\nTotal contigs : ".$l."\n";
+        print OUTPUT4 "Largest contig : ".$largest_contig." bp\n";
+        print OUTPUT4 "Smallest contig : ".$miminum_contig." bp\n";
+        print OUTPUT4 "Average insert size : ".$insert_size." bp\n\n";
+        $option++;
+
+        # Buffered write errors only surface at close, so report a failure here.
+        close OUTPUT2 or warn "Can't close file $output_file2, $!\n";
+    }
+
+
+
+    # While a heteroplasmy seed is being assembled the metrics report is skipped.
+    if ($hp_seed_assemble ne "")
+    {
+        goto HP0;
+    }
+    if ($heteroplasmy ne "" )
+    {
+        # Erase the progress text on the terminal.
+        print "\b" x length($progress_before);
+        print ' ' x length($progress_before);
+        print "\b" x length($progress_before);
+        print "\n";
+    }
+    # Each read count is the number of keys in the corresponding hash, doubled.
+    my $total_reads_organelle = (keys %count_reads)*2;
+    my $total_reads_organelle_all = (keys %count_reads_all)*2;
+    my $total_reads = (keys %hash)*2;
+    # Guard both divisions: without reads or without an assembly length the
+    # unguarded expressions die with "Illegal division by zero" before any
+    # metric is printed. The coverage fallback is never shown, because the
+    # coverage line is only printed when $assembly_length > 5000.
+    my $organelle_percentage = $total_reads > 0
+        ? sprintf("%.2f",($total_reads_organelle_all*100)/$total_reads)
+        : '0.00';
+    my $average_coverage = $assembly_length > 0
+        ? sprintf("%.0f",$total_reads_organelle_all*$read_length/$assembly_length)
+        : '0';
+
+    print "\n-----------------------------------------Input data metrics-----------------------------------------\n\n";
+    print "Total reads : ".$total_reads."\n";
+    print "Aligned reads : ".$total_reads_organelle_all."\n";
+    print "Assembled reads : ".$total_reads_organelle."\n";
+    if ($assembly_success eq "yes" || $tree_succes eq "yes")
+    {
+        print "Organelle genome % : ".$organelle_percentage." 
%\n";
+    }
+    # Remaining terminal metrics: coverage only for assemblies above 5000 bp,
+    # polymorphism count only in heteroplasmy mode.
+    print "Average organelle coverage : ".$average_coverage."\n" if ($assembly_length > 5000);
+    print "\nTotal intra-individual polymorphic elements : ".scalar(keys %variance_all)."\n" if ($heteroplasmy ne "");
+    print "\n----------------------------------------------------------------------------------------------------\n\n";
+
+    # The same report once more, written to OUTPUT4.
+    print OUTPUT4 "\n-----------------------------------------Input data metrics-----------------------------------------\n\n";
+    print OUTPUT4 "Total reads : ".$total_reads."\n";
+    print OUTPUT4 "Aligned reads : ".$total_reads_organelle_all."\n";
+    print OUTPUT4 "Assembled reads : ".$total_reads_organelle."\n";
+    print OUTPUT4 "Organelle genome % : ".$organelle_percentage." %\n" unless ($assembly_success ne "yes" && $tree_succes ne "yes");
+    print OUTPUT4 "Average organelle coverage : ".$average_coverage."\n" if ($assembly_length > 5000);
+    print OUTPUT4 "\nTotal intra-individual polymorphic elements : ".scalar(keys %variance_all)."\n" if ($heteroplasmy ne "");
+    print OUTPUT4 "\n----------------------------------------------------------------------------------------------------\n\n";
+HP0:
+# With variance detection on, write every %variance_all value to OUTPUT12 in ascending numeric key order.
+if ($variance_detection eq "yes")
+{
+    print OUTPUT12 $variance_all{$_}."\n" for (sort { $a <=> $b } keys %variance_all);
+}
+# First heteroplasmy pass with no seed assembly running: log both reads and flag the backward pass.
+if ($heteroplasmy ne "" && $hp_back eq "" && $hp_seed_assemble eq "")
+{
+    print OUTPUT5 $read." READ\n";
+    $hp_back = "yes";
+    print OUTPUT5 $read_new." 
READ_NEW\n";
+    $seed_input = substr $read_new, -100;    # new seed: the last 100 characters of $read_new
+    $y = '1';    # reset state before jumping back
+    $last_150 = "";
+    $first_150 = $read_length;
+    goto HP_BACK;    # label is defined outside this chunk
+}
+elsif ($heteroplasmy ne "" && $hp_back ne "")    # flag was already set: clear it and continue below
+{
+    $hp_back = "";
+}
+foreach my $variance (sort { $a <=> $b } keys %variance_all)    # write every %variance_all value to OUTPUT13 in ascending numeric key order
+{
+    print OUTPUT13 $variance_all{$variance}."\n";
+}
+if ($heteroplasmy ne "" && $heteroplasmy eq "uiggb")    # NOTE(review): true only when $heteroplasmy is exactly "uiggb"; looks like a switch that disables this section - confirm
+{
+    undef %SNPs;
+    if (%linked_half_SNPs2)    # pending entries: keys are sequences, values are split on '+' and '-' below
+    {
+        if ($linked_half_SNPs2_check ne "yes")    # one-time log, guarded by the flag set at the end of this branch
+        {
+            foreach my $variance (sort { $a <=> $b } keys %variance_all)    # body ends in last, so only the lowest key is logged
+            {
+                print OUTPUT13 $variance_all{$variance}." 1\n";
+                my $h = '0';    # counts printed keys so the label line is written only for a non-empty list
+                foreach my $linked_snp (sort { $a <=> $b }keys %linked_SNPs)
+                {
+                    print OUTPUT13 " ".$linked_snp;
+                    $h++;
+                }
+                if ($h > 0)
+                {
+                    print OUTPUT13 " LINKED_SNPs1\n";
+                }
+
+                $h = '0';
+                foreach my $linked_half_snp (sort { $a <=> $b }keys %linked_half_SNPs)
+                {
+                    print OUTPUT13 " ".$linked_half_snp;
+                    $h++;
+                }
+                if ($h > 0)
+                {
+                    print OUTPUT13 " HALF_LINKED_SNPs1\n";
+                }
+
+                $h = '0';
+                foreach my $not_linked_snp (sort { $a <=> $b }keys %not_linked_SNPs)
+                {
+                    print OUTPUT13 " ".$not_linked_snp;
+                    $h++;
+                }
+                if ($h > 0)
+                {
+                    print OUTPUT13 " NOT_LINKED_SNPs1\n";
+                }
+                $linked_half_SNPs2_check = "yes";
+                if ($hp_seed_assemble ne "")    # a seed assembly was running: store its id and sequence in OUTPUT14
+                {
+                    print OUTPUT14 ">".$id."\n";
+                    print OUTPUT14 $read."\n";
+                }
+                last;
+            }
+        }
+        foreach my $linked_half_SNPs2 (keys %linked_half_SNPs2)    # only the first key is processed: the body ends with goto REF2
+        {
+            print OUTPUT14 ">".$linked_half_SNPs2{$linked_half_SNPs2}."\n";
+            print OUTPUT14 $linked_half_SNPs2."\n";
+            my @split = split /\+/, $linked_half_SNPs2{$linked_half_SNPs2};    # part before '+' becomes the seed key
+            my @split2 = split /\-/, $split[1];    # part after '+' is a '-' separated list of keys
+            undef %linked_SNPs;
+            foreach my $split2 (@split2)    # register every non-empty list item in %linked_SNPs and %SNPs
+            {
+                if ($split2 ne "")
+                {
+                    $linked_SNPs{$split2} = undef;
+                    $SNPs{$split2} = undef;
+                }
+            }
+
+            $seed{$split[0]} = $linked_half_SNPs2;    # the sequence itself becomes the new seed
+            $seed_input_new2 = $linked_half_SNPs2;
+            $position{$split[0]} = length($linked_half_SNPs2);
+            $position_back{$split[0]} = '0';
+            print OUTPUT5 $linked_half_SNPs2." 
SEQ_TEST2\n";
+            $first_contig_start = "";    # reset assembly state before restarting at REF2 with the new seed
+            $hp_seed_assemble_last_chance = "";
+            $hp_seed_assemble = "yes";
+            $last_150 = "";
+            undef %accepted_SNPs;
+            undef %accepted_SNPs_back;
+            undef %accepted_SNPs_pair;
+            undef %accepted_SNPs_pair_back;
+
+            undef %linked_half_SNPs;
+            undef %not_linked_SNPs;
+            $y = '1';
+            delete $linked_half_SNPs2{$linked_half_SNPs2};    # consume this entry so a later pass takes another one
+            goto REF2;    # REF2 is defined outside this chunk
+        }
+        undef %linked_half_SNPs2;    # not reached while the hash is non-empty: the loop above always leaves via goto
+    }
+    elsif ($hp_seed_assemble ne "")    # nothing pending but a seed assembly was running: store its id and sequence in OUTPUT14
+    {
+        print OUTPUT14 ">".$id."\n";
+        print OUTPUT14 $read."\n";
+    }
+    undef %linked_half_SNPs_exclude;
+    if ($first_linked_half_SNP_pos ne "" && $first_linked_half_SNP_pos ne "yes")    # a position is stored and not yet used ("yes" marks it as used, see below)
+    {
+        foreach my $variance (sort { $a <=> $b } keys %variance_all)    # body ends in last, so only the lowest key is logged
+        {
+            print OUTPUT13 $variance_all{$variance}."\n";
+            my $h = '0';    # counts printed keys so the label line is written only for a non-empty list
+            foreach my $linked_snp (sort { $a <=> $b }keys %linked_SNPs)
+            {
+                print OUTPUT13 " ".$linked_snp;
+                $h++;
+            }
+            if ($h > 0)
+            {
+                print OUTPUT13 " LINKED_SNPs2\n";
+            }
+
+            $h = '0';
+            foreach my $linked_half_snp (sort { $a <=> $b }keys %linked_half_SNPs)
+            {
+                print OUTPUT13 " ".$linked_half_snp;
+                $h++;
+            }
+            if ($h > 0)
+            {
+                print OUTPUT13 " HALF_LINKED_SNPs2\n";
+            }
+
+            $h = '0';
+            foreach my $not_linked_snp (sort { $a <=> $b }keys %not_linked_SNPs)
+            {
+                print OUTPUT13 " ".$not_linked_snp;
+                $h++;
+            }
+            if ($h > 0)
+            {
+                print OUTPUT13 " NOT_LINKED_SNPs2\n";
+            }
+            last;
+        }
+
+        my $first_linked_half_SNP_read_tmp = $first_linked_half_SNP_read;
+        $first_linked_half_SNP_read = substr $first_linked_half_SNP_read_tmp, -90;    # keep only the last 90 characters as seed
+        $seed{$first_linked_half_SNP_pos} = $first_linked_half_SNP_read;
+        $seed_input_new2 = $first_linked_half_SNP_read;
+        $position{$first_linked_half_SNP_pos} = length($first_linked_half_SNP_read);
+        $position_back{$first_linked_half_SNP_pos} = '0';
+        print OUTPUT5 $first_linked_half_SNP_read." 
SEQ_TEST\n";
+
+        $hp_seed_assemble = "yes";
+        $SNPs{length($first_linked_half_SNP_read)-$first_linked_half_SNP_pos+1} = undef;    # registers one key computed from the seed length and the stored position
+
+        %linked_half_SNPs_exclude = %linked_half_SNPs;    # copy taken before %linked_half_SNPs is cleared below
+        $first_linked_half_SNP_pos = "yes";    # marks the stored position as used (tested by the enclosing if)
+        $hp_seed_assemble_last_chance = "";
+        $first_contig_start = "";
+        undef %accepted_SNPs;
+        undef %accepted_SNPs_back;
+        undef %accepted_SNPs_pair;
+        undef %accepted_SNPs_pair_back;
+        undef %linked_SNPs;
+        undef %linked_half_SNPs;
+        $y = '1';
+        goto REF2;    # REF2 is defined outside this chunk
+    }
+    my $w = '0';    # becomes non-zero once a seed has been built in the HP loop
+    my $pos_prev = '0';    # assigned in the loop; not read in the visible code
+    my $pos_tmp;    # second tab-separated field of the variance the seed is built for
+    my $seq_test;
+    my $hp_seed;
+    my %hp_deleted;    # keys removed from %variance_all after the loop
+    my $add_no_linkage = "";
+    my $add_linkage = "";
+
+HP: foreach my $variance (sort { $a <=> $b } keys %variance_all)    # ascending numeric key order
+    {
+        my @variances = split /\t/, $variance_all{$variance};    # value is a tab-separated record
+
+        if ($w > 0)    # a seed was built in the previous iteration: extend it towards the next variance, then stop
+        {
+            my $next_pos;    # undef, then "yes" once $pos_tmp is found, then the key that follows it
+            foreach my $variance_tmp (sort { $a <=> $b } keys %variance_all)
+            {
+                if ($next_pos eq "yes")
+                {
+                    $next_pos = $variance_tmp;
+                    last;
+                }
+                if ($variance_tmp eq $pos_tmp)
+                {
+                    $next_pos = "yes";
+                }
+            }
+            my $length_extra = $next_pos-$pos_tmp-1;    # number of positions strictly between the two
+            my $extra = substr $hashref2{$current_pos}, 0, $length_extra;    # presumably %hashref2 holds reference chunks keyed by start position - confirm
+            $hp_seed .= $extra;
+
+            if (length($hp_seed) <= $overlap+5 && $length_extra > 30)    # seed still short: also take from the entry 30 positions further
+            {
+                my $extra2 = substr $hashref2{$current_pos+30}, 0, $length_extra-30;
+                $hp_seed .= $extra2;
+            }
+
+            $seed{$pos_tmp} = $hp_seed;    # NOTE(review): unlike the other seed sites, $position_back is not reset here - confirm
+            $seed_input_new2 = $hp_seed;
+            $position{$pos_tmp} = length($hp_seed);
+            print OUTPUT5 $hp_seed." SEQ_TEST\n";
+            $w = '0';    # prevents the matching block after the loop from running again
+            last HP;
+        }
+        if ($hp_seed_assemble ne "" && $linked_half_SNPs2_check ne "yes")    # a seed assembly just ran: log and drop this variance, then move to the next
+        {
+            print OUTPUT13 $variance_all{$variance}." 
2\n";
+            my $h = '0';    # counts printed keys so the label line is written only for a non-empty list
+            foreach my $linked_snp (sort { $a <=> $b }keys %linked_SNPs)
+            {
+                print OUTPUT13 " ".$linked_snp;
+                $h++;
+            }
+            if ($h > 0)
+            {
+                print OUTPUT13 " LINKED_SNPs3\n";
+            }
+
+            $h = '0';
+            foreach my $linked_half_snp (sort { $a <=> $b }keys %linked_half_SNPs)
+            {
+                print OUTPUT13 " ".$linked_half_snp;
+                $h++;
+            }
+            if ($h > 0)
+            {
+                print OUTPUT13 " HALF_LINKED_SNPs3\n";
+            }
+
+            $h = '0';
+            foreach my $not_linked_snp (sort { $a <=> $b }keys %not_linked_SNPs)
+            {
+                print OUTPUT13 " ".$not_linked_snp;
+                $h++;
+            }
+            if ($h > 0)
+            {
+                print OUTPUT13 " NOT_LINKED_SNPs3\n";
+            }
+            $hp_deleted{$variance} = undef;    # schedule this key for removal from %variance_all
+            $pos_prev = $variances[1];
+            $hp_seed_assemble = "";    # later iterations fall through to the seed-building code
+            next HP;
+        }
+        elsif ($hp_seed_assemble ne "" && $linked_half_SNPs2_check eq "yes")    # same as above without the log; also clears the check flag
+        {
+            $linked_half_SNPs2_check = "";
+            $hp_deleted{$variance} = undef;
+            $pos_prev = $variances[1];
+            $hp_seed_assemble = "";
+            next HP;
+        }
+        $pos_tmp = $variances[1];    # presumably the variant position column - confirm
+        my $ref_tmp = $variances[3];    # not read again in the visible code
+        my $ref_alt_tmp = $variances[4];    # comma-separated list; item [1] is written into the seed further down
+        my @ref_alt_tmp = split /,/, $ref_alt_tmp;
+        $seq_test = "";
+        my $add_linked_SNP = "";    # offset inside $seq_test where a linked allele is substituted ("" = none)
+        my $add_linked_SNP_alt = "";
+
+        if (exists($not_linked_SNPs{$pos_tmp}))    # position is marked as not linked
+        {
+            my $prev_SNP = "";    # key visited just before the current one in the scan below
+            print OUTPUT5 $current_pos." CURRENT_POS\n";
+            foreach my $variance (sort { $a <=> $b } keys %variance_all_SNP)    # look for the key equal to $current_pos-1
+            {
+                if ($variance eq $current_pos-1)
+                {
+                    print OUTPUT5 $prev_SNP." PREV_POS\n";
+                    $add_no_linkage = $variance;    # re-registered in %not_linked_SNPs after the loop
+
+                    if ($prev_SNP ne "")    # rewind to just after the preceding key, or to 1 when there is none
+                    {
+                        $current_pos = $prev_SNP+1;
+                    }
+                    else
+                    {
+                        $current_pos = '1';
+                    }
+                    last;
+                }
+                $prev_SNP = $variance;
+            }
+        }
+        elsif (exists($linked_SNPs{$pos_tmp}))    # position is marked as linked
+        {
+            if (exists($linked_half_SNPs{$pos_tmp}))    # half-linked positions need no adjustment
+            {}
+            else
+            {
+                my $prev_SNP = "";
+                my $current_pos_tmp = $current_pos;    # kept to compute the substitution offset after the rewind
+                print OUTPUT5 $current_pos." CURRENT_POS1\n";
+                foreach my $variance (sort { $a <=> $b } keys %variance_all_SNP)    # look for the key equal to $current_pos-1
+                {
+                    if ($variance eq $current_pos-1)
+                    {
+                        print OUTPUT5 $prev_SNP." 
PREV_POS1\n";
+                        $add_linkage = $variance;    # re-registered in %linked_SNPs after the loop
+
+                        if ($prev_SNP ne "")    # rewind to just after the preceding key, or to 1 when there is none
+                        {
+                            $current_pos = $prev_SNP+1;
+                        }
+                        else
+                        {
+                            $current_pos = '1';
+                        }
+                        $add_linked_SNP = $current_pos_tmp - $current_pos-1;    # offset of the linked position relative to the rewound start
+                        my @ref_alt_tmp = split /,/, $variance_all_SNP{$variance};    # shadows the outer @ref_alt_tmp inside this block only
+                        $add_linked_SNP_alt = $ref_alt_tmp[1];
+                        last;
+                    }
+                    $prev_SNP = $variance;
+                }
+            }
+        }
+        if ($pos_tmp > $current_pos+29)    # target lies 30 or more positions ahead: concatenate %hashref2 entries in steps of 30
+        {
+            while ($current_pos <= $pos_tmp)
+            {
+                $seq_test .= $hashref2{$current_pos};
+                $current_pos += 30;
+            }
+            if ($current_pos > $pos_tmp)    # overshoot: trim the tail and pull $current_pos back to $pos_tmp+1
+            {
+                substr $seq_test, -($current_pos-$pos_tmp)+1, $current_pos-$pos_tmp, "";
+                $current_pos -= ($current_pos-$pos_tmp-1);
+            }
+        }
+        else    # target is within the current entry: take its prefix up to and including $pos_tmp
+        {
+            $seq_test = substr $hashref2{$current_pos}, 0, $pos_tmp-$current_pos+1;
+            $current_pos += ($pos_tmp-$current_pos+1);
+        }
+        if ($add_linked_SNP ne "")    # substitute the linked allele found above
+        {
+            substr $seq_test, $add_linked_SNP, 1, $add_linked_SNP_alt;
+        }
+        $hp_seed = substr $seq_test, -($read_length*0.8);    # seed is the tail of $seq_test, 0.8 x read length long
+        substr $hp_seed, -1, 1, $ref_alt_tmp[1];    # last character is replaced by item [1] of the comma-separated field
+        $hp_seed_assemble = "yes";
+
+        $SNPs{length($hp_seed)} = $ref_alt_tmp[1];    # keyed by the 1-based position of that character in the seed
+        if ($add_linked_SNP ne "" && (length($seq_test)-$add_linked_SNP) <= length($hp_seed))    # linked substitution falls inside the seed: register it too
+        {
+            my $pos_SNP = length($hp_seed)-(length($seq_test)-$add_linked_SNP)+1;
+            $SNPs{$pos_SNP} = $add_linked_SNP_alt;
+        }
+        $w++;    # next iteration extends the seed and leaves the loop
+    }
+
+    if ($w > 0)    # seed was built on the last iteration, so the in-loop extension never ran
+    {
+        my $next_pos;    # NOTE(review): if $pos_tmp is the highest key this stays "yes" and the arithmetic below uses it as a number - confirm
+        foreach my $variance_tmp (sort { $a <=> $b } keys %variance_all)
+        {
+            if ($next_pos eq "yes")
+            {
+                $next_pos = $variance_tmp;
+                last;
+            }
+            if ($variance_tmp eq $pos_tmp)
+            {
+                $next_pos = "yes";
+            }
+        }
+        my $length_extra = $next_pos-$pos_tmp-1;
+        my $extra = substr $hashref2{$current_pos}, 0, $length_extra;
+        $hp_seed .= $extra;
+
+        if (length($hp_seed) <= $overlap+5 && $length_extra > 30)    # seed still short: also take from the entry 30 positions further
+        {
+            my $extra2 = substr $hashref2{$current_pos+30}, 0, $length_extra-30;
+            $hp_seed .= $extra2;
+        }
+        $seed{$pos_tmp} = $hp_seed;
+        $seed_input_new2 = $hp_seed;
+        $position{$pos_tmp} = length($hp_seed);
+        $position_back{$pos_tmp} = '0';
+        print OUTPUT5 $hp_seed." 
SEQ_TEST\n";
+    }
+
+    # At least one seed is registered: drop the variances marked during the
+    # loop, reset the per-assembly state and restart at REF2.
+    if (keys %seed)
+    {
+        delete @variance_all{keys %hp_deleted};
+
+        $hp_seed_assemble_last_chance = "";
+        $first_contig_start = "";
+        $last_150 = "";
+        $y = '1';
+
+        undef %accepted_SNPs;
+        undef %accepted_SNPs_back;
+        undef %accepted_SNPs_pair;
+        undef %accepted_SNPs_pair_back;
+        undef %linked_SNPs;
+        undef %linked_half_SNPs;
+        undef %not_linked_SNPs;
+
+        # Re-register the linkage found in the loop, plus the seed position itself.
+        $linked_SNPs{$add_linkage} = undef if ($add_linkage ne "");
+        $not_linked_SNPs{$add_no_linkage} = undef if ($add_no_linkage ne "");
+        $linked_SNPs{$pos_tmp} = undef;
+
+        goto REF2;
+    }
+}
+print "\nThank you for using NOVOPlasty!\n\n";
+# Close the input handle and every output handle, in the original order.
+close $_ for (\*INPUT, \*OUTPUT4, \*OUTPUT5, \*OUTPUT6, \*OUTPUT7, \*OUTPUT10, \*OUTPUT11, \*OUTPUT12, \*OUTPUT13, \*OUTPUT14);