diff --git a/HiNT/externalScripts/extract_nonHiC.pl b/HiNT/externalScripts/extract_nonHiC.pl index a54d942..b5891bb 100644 --- a/HiNT/externalScripts/extract_nonHiC.pl +++ b/HiNT/externalScripts/extract_nonHiC.pl @@ -7,7 +7,7 @@ # Input: # Pairsam file # Enzyme: -# Mbo1/Dpn2/HindIII +# MboI, DpnII, HindIII, AluI, NotI, NcoI, Arima, 'MboI+HinfI', 'HinfI+MboI' # All output options: # FASTQ file: Softclipped part of the non Hi-C chmeric reads # {OUTPUT_PREFIX}_2.txt File contains non Hi-C chimeric reads (softclipped).\n"; @@ -22,24 +22,21 @@ use strict; use POSIX; use Getopt::Long; -use Data::Dumper; use constant { true => 1, false => 0 }; -use experimental 'smartmatch'; use utf8; use open qw(:std :utf8); ######---------------------------------------------------------------------------------------------------------------- -### inputs +### inputs, variables etc ######---------------------------------------------------------------------------------------------------------------- my $psam = ""; -my $outprefix = "project"; -my $re = "MboI"; +my $re = ""; my $sites = ""; +my $outprefix = "project"; my $cutoff = 2; -my $min_clip = 28; +my $min_clip = 7; my $help = 0; - Getopt::Long::GetOptions( 'psam=s' => \$psam, 're=s' => \$re, @@ -51,61 +48,69 @@ 'h' => \$help, ) or die "Incorrect input! Use -h for usage.\n"; -if ($help) { - print "\nUsage: perl extract_nonHiC.pl -psam psam.gz -re [STRING] -sites sites.txt -outprefix [STRING] -margin [INT,2] -min_clip [INT,28]\n\n\n"; - print "required options:\n"; - print " -psam The Input psam file\n"; - print " -sites RE sites along the genome (produced using 'generate_site_positions.py' script from Juicer package)\n"; - print " -re Name of the RE [MboI, DpnII, HindIII]\n"; - print "optional options:\n"; - print " -outprefix Output file PREFIX [default: project]\n"; - print " -margin Upstream Margin for finding genomic RE site [default: 2]\n"; - print " -min_clip Minimum length of softclipped part on the read [default : 28]\n"; - print " -help|-h Display usage information.\n\n\n"; - print "Default outputs:\n"; - print " {OUTPUT_PREFIX}_2.txt File contains non Hi-C chimeric reads (softclipped).\n"; - print " {OUTPUT_PREFIX}_3.txt File contains non Hi-C chimeric reads (reads with MAPQ<30, whole read sequence is reported).\n"; - print " {OUTPUT_PREFIX}_2dedup.txt File contains DUPLICATE REMOVED non Hi-C chimeric reads (softclipped).\n"; - print " {OUTPUT_PREFIX}_3dedup.txt File contains DUPLICATE REMOVED non Hi-C chimeric reads (reads with MAPQ<30, whole read sequence is reported).\n\n\n"; - print "Other hardcoded parameters:\n"; - print " Enzyme collection: Mbo1/Dpn2/HindIII\n"; - print " Proximity filter for mapping two mates of the pair : 500bp \n\n\n"; - exit 0; +sub help{ + my $j = shift; + if($j){ + print "\nUsage: perl extract_nonHiC.pl -psam psam.gz -re [STRING] -sites sites.txt -outprefix [STRING] -margin [INT,2] -min_clip [INT,7]\n\n\n"; + print "required options:\n"; + print " -psam The Input psam file\n"; + print " -sites RE sites along the genome (produced using 'generate_site_positions.py' script from Juicer package)\n"; + print " -re Name of the RE [MboI, DpnII, HindIII, AluI, NotI, NcoI, Arima, 'MboI+HinfI', 'HinfI+MboI']\n"; + print "optional options:\n"; + print " -outprefix Output file PREFIX [default: project]\n"; + print " -margin Margin for finding genomic RE site [default: 2]\n"; + print " -min_clip Minimum length of softclipped part on the read [default : 7]\n"; + print " -help|-h Display usage information.\n\n\n"; + print "Default outputs:\n"; + #print " {OUTPUT_PREFIX}_2.txt File contains non Hi-C chimeric reads (softclipped).\n"; + #print " {OUTPUT_PREFIX}_2dedup.txt File contains DUPLICATE REMOVED non Hi-C chimeric reads (softclipped).\n"; + print " {OUTPUT_PREFIX}_3.txt File contains non Hi-C chimeric reads (reads with MAPQ<30, whole read sequence is reported).\n"; + print " {OUTPUT_PREFIX}_3dedup.txt File contains DUPLICATE REMOVED non Hi-C chimeric reads (reads with MAPQ<30, whole read sequence is reported).\n\n\n"; + print "Other hardcoded parameters:\n"; + print " Enzyme collection: MboI, DpnII, HindIII, AluI, NotI, NcoI, Arima, 'MboI+HinfI', 'HinfI+MboI'\n"; + print " Proximity filter for mapping two mates of the pair : 500bp \n\n\n"; + exit 1; + } } ######### Global varialbes my %counts; -my $motif; -my $len_RE = 8; - -if($re eq "DpnII" or $re eq "MboI"){ - $motif = 'GATC'; -}elsif($re eq "HindIII"){ - $motif = "AAGCTT"; -}else{ - print " Restriction enzyme not specified. Exiting!\n"; - exit 1; -} -$len_RE = 8 if($motif eq "GATC"); -$len_RE = 10 if($motif eq "AAGCTT"); -#$sites = "/n/data1/hms/dbmi/park/Dhawal/Genomes/hg19_annotations/hg19_DpnII.txt"; +my %redb; +$redb{"AluI"}{motif} = "AGCT|TCGA"; +$redb{"AluI"}{mlen}=4; +$redb{"NotI"}{motif} = "GCGGCCGGCCGC|CGCCGGCCGGCG"; +$redb{"NotI"}{mlen}=8; +$redb{"MboI"}{motif} = "GATCGATC|CTAGCTAG"; +$redb{"MboI"}{mlen}=4; +$redb{"DpnII"}{motif} = "GATCGATC|CTAGCTAG"; +$redb{"DpnII"}{mlen}=4; +$redb{"HindIII"}{motif} = "AAGCTAGCTT|TTCGATCGAA"; +$redb{"HindIII"}{mlen}=6; +$redb{"NcoI"}{motif} = "CCATGCATGG|GGTACGTACC"; +$redb{"NcoI"}{mlen}=6; +$redb{"MboI+HinfI"}{motif} = "GATCGATC|CTAGCTAG|GA[ATGC]{1}TA[ATGC]{1}TC|GATCA[ATGC]{1}TC|GA[ATGC]{1}TGATC|CT[GATC]{1}AT[AGTC]{1}AG|CT[ATGC]{1}ACTAG|CTAGT[ATGC]{1}AG"; +$redb{"MboI+HinfI"}{mlen}=5; +$redb{"HinfI+MboI"}{motif} = "GATCGATC|CTAGCTAG|GA[ATGC]{1}TA[ATGC]{1}TC|GATCA[ATGC]{1}TC|GA[ATGC]{1}TGATC|CT[GATC]{1}AT[AGTC]{1}AG|CT[ATGC]{1}ACTAG|CTAGT[ATGC]{1}AG"; +$redb{"HinfI+MboI"}{mlen}=5; +$redb{"Arima"}{motif} = "GATCGATC|CTAGCTAG|GA[ATGC]{1}TA[ATGC]{1}TC|GATCA[ATGC]{1}TC|GA[ATGC]{1}TGATC|CT[GATC]{1}AT[AGTC]{1}AG|CT[ATGC]{1}ACTAG|CTAGT[ATGC]{1}AG"; +$redb{"Arima"}{mlen}=5; ######---------------------------------------------------------------------------------------------------------------- ### checks and primary readings ######---------------------------------------------------------------------------------------------------------------- +help($help); +if($re eq "" or !$redb{$re}{motif} or !$redb{$re}{mlen} ){ + print "RE not recognized\n"; + help(1); +} if($psam eq ""){ print "ERROR: Required input psam file not provided\n"; - exit 1; + help(1); } if($sites eq ""){ print "ERROR: Required genomic TE location file not provided\n"; - exit 1; + help(1); } -if($motif eq ""){ - print "ERROR: Required value for RE motif not provided\n"; - exit 1; -} - ### Read Juicer formatted RE motif locations as a hash open FILE, $sites or die $!; @@ -137,26 +142,25 @@ s/\n//; s/\r//; my @psam = split(/\t+/); + $counts{flag}{$psam[7]}++; #flag count - $counts{pairs}{total}++; if($counts{pairs}{total} % 100000 == 0 ){ print "Read-pairs: $counts{pairs}{total} \t"; - print "non-HiC chimeric mates: $counts{mates}{nonHic_chimera} \n"; + print "non-HiC chimeric mates: $counts{mates}{nonHic_chimera} \n" if($counts{mates}{nonHic_chimera}); + print "non-HiC chimeric mates: 0 \n" if(!$counts{mates}{nonHic_chimera}); } ## omit UU,UR,RU reads next if($psam[7] eq "UU" or $psam[7] eq "UR" or $psam[7] eq "RU"); - next if($psam[7] eq "UU" or $psam[7] eq "UR" or $psam[7] eq "RU"); - + my($a,$b,$c,$d)=parse_psam_reads($psam[8], $psam[9]); my @read1 = @{$a}; my @read2 = @{$b}; my @read3 = @{$c}; my @read4 = @{$d}; - - $counts{pairs}{used_pairs}++; - next if(!@read1 && !@read2); + $counts{pairs}{used_pairs}++; + ## Get rid of all reads that have atleast 2 N bases if(@read1){ my $num_N1 = $read1[9] =~ tr/N//; @@ -171,19 +175,11 @@ if(@read1){ $counts{mates}{used_mates}++; my $cnt = nonHiC_by_LigationSignature(\*OUT, \*OUT1, \*OUT2, \@read1, \@read3, 1); - if($cnt==2){ - print "MAJOR ERROR\n"; - exit 1; - } } ############################# Read2 present if(@read2){ $counts{mates}{used_mates}++; my $cnt = nonHiC_by_LigationSignature(\*OUT, \*OUT1, \*OUT2, \@read2, \@read4, 2); - if($cnt==2){ - print "MAJOR ERROR\n"; - exit 1; - } } undef(@read1); undef(@read2); @@ -191,45 +187,81 @@ undef(@read4); undef(@psam); } +print "Read-pairs: $counts{pairs}{total} \t"; +print "non-HiC chimeric mates: $counts{mates}{nonHic_chimera} \n" if($counts{mates}{nonHic_chimera}); +print "non-HiC chimeric mates: 0 \n" if(!$counts{mates}{nonHic_chimera}); close(IN); -close(OUT); -close(OUT1); +#close(OUT); +#close(OUT1); close(OUT2); ### Remove duplicates -my $z; #$z = remove_duplicates($outprefix."_2.txt",$outprefix."_2dedup.txt"); -$z = remove_duplicates($outprefix."_3.txt",$outprefix."_3dedup.txt"); - - -my $hic = $counts{mates}{used_mates} - $counts{mates}{nonHic_chimera}; - +$counts{mates}{dedups} = remove_duplicates($outprefix."_3.txt",$outprefix."_3dedup.txt"); + +##### Initializations and summary output +$counts{mates}{nonHic_chimera}=0 if(!$counts{mates}{nonHic_chimera}); +$counts{mates}{used_mates}=0 if(!$counts{mates}{used_mates}); +$counts{pairs}{total}=0 if(!$counts{pairs}{total}); +$counts{pairs}{used_pairs}=0 if(!$counts{pairs}{used_pairs}); +$counts{mates}{count_unmapped}=0 if(!$counts{mates}{count_unmapped}); +$counts{mates}{motif_in_unmapped}=0 if(!$counts{mates}{motif_in_unmapped}); +$counts{mates}{motif_spans}=0 if(!$counts{mates}{motif_spans}); +$counts{mates}{motif_hardClip}=0 if(!$counts{mates}{motif_hardClip}); +$counts{mates}{motif_ref}=0 if(!$counts{mates}{motif_ref}); +$counts{mates}{motif_proximity}=0 if(!$counts{mates}{motif_proximity}); +$counts{mates}{double_clip}=0 if(!$counts{mates}{double_clip}); +$counts{mates}{motif_multiple}=0 if(!$counts{mates}{motif_multiple}); +$counts{mates}{nonHic_chimera}=0 if(!$counts{mates}{nonHic_chimera}); +$counts{mates}{dedups}=0 if(!$counts{mates}{dedups}); +$counts{mates}{filtered}=0 if(!$counts{mates}{filtered}); +my $perc = 0; +print "\n\n"; print "###---------------------------Summary------------------------------------------------------------\n"; print "Input psam file: $psam\n"; print "RE enzyme: $re\n"; -print "RE motif: $motif\n"; +print "RE motif: $redb{$re}{motif}\n"; +print "RE motif length (maximum if enzyme combination is used): $redb{$re}{mlen}\n"; print "Input file with genomic location of RE motifs: $sites\n"; +print "Minimum required clipped read sequence $min_clip\n"; +print "Distance between clip-coordinate and RE site $cutoff\n"; print "Total read pairs in the file: $counts{pairs}{total} \n"; -print "Flag Summary\n"; +print "#Flag Summary\n"; foreach my $cl (sort keys %{$counts{flag}}){ - print " $cl: $counts{flag}{$cl}\n"; + $counts{flag}{$cl}=0 if(!$counts{flag}{$cl}); + $perc = round( $counts{flag}{$cl}*100/$counts{pairs}{total},2); + print " >>$cl: $counts{flag}{$cl} ($perc %)\n"; } print "Total non-HiC read pairs used: $counts{pairs}{used_pairs}\n"; print "Total non-HiC mates used: $counts{mates}{used_mates}\n"; -print "Total Hi-C chimeric/unmapped etc. mates: $hic\n"; -print "Total unmapped mates $counts{mates}{count_unmapped}\n"; -print "Total unmapped mates with RE motif $counts{mates}{motif_in_unmapped}\n"; -print "Mates with RE motif at clip junction: $counts{mates}{motif_spans}\n"; -print "Mates with RE motif within unmapped clip sequence: $counts{mates}{motif_hardClip}\n"; -print "Mates with reference RE site annotation: $counts{mates}{motif_ref}\n"; -print "Mates with soft-clipping match at proximity <500bp: $counts{mates}{motif_proximity}\n"; -print "Mates with both 5' and 3' clips (>$min_clip bp): $counts{mates}{double_clip}\n"; -print "Filtered mates with both 5' and 3' clips (>$min_clip bp): $counts{mates}{motif_multiple}\n"; -print "Total non-Hi-C chimeric mates: $counts{mates}{nonHic_chimera}\n"; -print "Total non-Hi-C chimeric mates after duplicate removal: $z \n"; - +my $hic = $counts{mates}{used_mates} - $counts{mates}{nonHic_chimera}; +$perc = round( $hic*100/$counts{mates}{used_mates},2); +print "Total Hi-C chimeric/unmapped/filtered etc. mates: $hic ($perc % of used mates)\n"; +$perc = round( $counts{mates}{filtered}*100/$counts{mates}{used_mates},2); +print "Total Hi-C filtered mates: $counts{mates}{filtered} ($perc % of used mates)\n"; +#$perc = round( $counts{mates}{count_unmapped}*100/$counts{mates}{used_mates},2); +#print "Total unmapped mates $counts{mates}{count_unmapped} ($perc % of used mates)\n"; +$perc = round( $counts{mates}{motif_in_unmapped}*100/$counts{mates}{used_mates},2); +print "Total unmapped mates with RE motif $counts{mates}{motif_in_unmapped} ($perc % of used mates)\n"; +$perc = round( $counts{mates}{motif_spans}*100/$counts{mates}{used_mates},2); +print "Mates with RE motif at clip junction: $counts{mates}{motif_spans} ($perc % of used mates)\n"; +$perc = round( $counts{mates}{motif_hardClip}*100/$counts{mates}{used_mates},2); +print "Mates with RE motif within unmapped clip sequence: $counts{mates}{motif_hardClip} ($perc % of used mates)\n"; +$perc = round( $counts{mates}{motif_ref}*100/$counts{mates}{used_mates},2); +print "Mates with reference RE site annotation: $counts{mates}{motif_ref} ($perc % of used mates)\n"; +$perc = round( $counts{mates}{motif_proximity}*100/$counts{mates}{used_mates},2); +print "Mates with soft-clipping match at proximity <500bp: $counts{mates}{motif_proximity} ($perc % of used mates)\n"; +$perc = round( $counts{mates}{double_clip}*100/$counts{mates}{used_mates},2); +print "Mates with both 5' and 3' clips (>$min_clip bp): $counts{mates}{double_clip} ($perc % of used mates)\n"; +#$perc = round( $counts{mates}{motif_multiple}*100/$counts{mates}{used_mates},2); +#print "Filtered mates with both 5' and 3' clips (>$min_clip bp): $counts{mates}{motif_multiple} ($perc % of used mates)\n"; +$perc = round( $counts{mates}{nonHic_chimera}*100/$counts{mates}{used_mates},2); +print "Total non-Hi-C chimeric mates: $counts{mates}{nonHic_chimera} ($perc % of used mates)\n"; +$perc = round( $counts{mates}{dedups}*100/$counts{mates}{used_mates},2); +print "Total non-Hi-C chimeric mates after duplicate removal: $counts{mates}{dedups} ($perc % of used mates)\n"; +print "\n\n"; ######---------------------------------------------------------------------------------------------------------------- ### subroutines @@ -272,7 +304,7 @@ sub nonHiC_by_LigationSignature { my @read3 = @{$b}; ## uses following global variables - # $len_RE, $min_clip, %chromosomes, $cutoff, $motif, %counts + # $min_clip, %chromosomes, $cutoff, %counts, $re ## Three pronged approach: # 1. Check if RE ligation motif spans clip junction @@ -284,22 +316,22 @@ sub nonHiC_by_LigationSignature { ## Nonmapping reads do not have CIGAR, check if they have RE motif. If not, print them and move on ################## if($read1[5] eq "*"){ - $counts{mates}{count_unmapped}++; - if(re_search($read1[9],$motif,-1)==0){ - #print $fh1 "@","$read1[0]_$readnum 1:N:0:0 $read1[-2]\n$read1[9]\n+\n$read1[10]\n"; - #print $fh2 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$read1[9]\t$read1[10]\t"; - print $fh3 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$read1[9]\t$read1[10]\t"; - if(@read3){ - #print $fh2 "$read3[2]\t$read3[3]\t$read3[-1]\t$read3[4]\t$read3[5]\t$read3[9]\t-\t-\n"; - print $fh3 "$read3[2]\t$read3[3]\t$read3[-1]\t$read3[4]\t$read3[5]\t$read3[9]\t-\t-\n"; - }else{ - #print $fh2 "-\t-\t-\t-\t-\t-\t-\t-\n"; - print $fh3 "-\t-\t-\t-\t-\t-\t-\t-\n"; - } + $counts{mates}{count_unmapped}++; + if(re_search($read1[9],$re,-1)==0){ + #print $fh1 "@","$read1[0]_$readnum 1:N:0:0 $read1[-2]\n$read1[9]\n+\n$read1[10]\n"; + #print $fh2 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$read1[9]\t$read1[10]\t"; + print $fh3 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$read1[9]\t$read1[10]\t"; + if(@read3){ + #print $fh2 "$read3[2]\t$read3[3]\t$read3[-1]\t$read3[4]\t$read3[5]\t$read3[9]\t-\t-\n"; + print $fh3 "$read3[2]\t$read3[3]\t$read3[-1]\t$read3[4]\t$read3[5]\t$read3[9]\t-\t-\n"; }else{ - $counts{mates}{motif_in_unmapped}++; + #print $fh2 "-\t-\t-\t-\t-\t-\t-\t-\n"; + print $fh3 "-\t-\t-\t-\t-\t-\t-\t-\n"; } - return(0); #success + }else{ + $counts{mates}{motif_in_unmapped}++; + } + return(0); #success } ################## @@ -308,165 +340,145 @@ sub nonHiC_by_LigationSignature { my @len1 = split (/\D+/,$read1[5]); # storing the length per operation my @ops1 = split (/\d+/,$read1[5]); # storing the operation shift @ops1; # remove the empty first element - return(1) unless (scalar @len1 == scalar @ops1); - return(1) if(!@ops1); - + return(1) if ( !@ops1 or scalar @len1 != scalar @ops1); + ################## ## Non-HiC chimera test ################## - my $motif_len = length($motif); - - my ($seq1, $qual1) = ""; - my $olap = "-"; my $xalign = "-"; - my %localcount; $localcount{motif_spans}=0; $localcount{motif_hardClip}=0; $localcount{motif_ref}=0; $localcount{count_prox}=0; $localcount{nonHic_chimera}=0; - - ## 5' clip of read1 my $is_match_a=0; + my $is_match_b=0; + my $leftoutread = 0; + + ## 5' clip of read1 if($ops1[0] eq "S" && $len1[0] >= $min_clip){ - if(re_search($read1[9],$motif, $len1[0])>0){ - $is_match_a++; - $localcount{motif_spans}++; + $leftoutread=1; + if(re_search($read1[9],$re, $len1[0])>0){ + $is_match_a=1; $localcount{motif_spans}=1; }elsif(RE_within_HardclippedSeq(\@read1,\@len1,\@ops1,\@read3)>0){ - $is_match_a++; - $localcount{motif_hardClip}++; - }elsif(@read3){ + $is_match_a=1; $localcount{motif_hardClip}=1; + }else{ my $x1= $read1[3]; - my $x2= $read3[3]; - if($read3[-1] eq "+"){ $x2 = $read3[3]+length($read3[9]); } my $pos1 = &binarysearch( $x1, $chromosomes{$read1[2]}); - my $pos2 = &binarysearch( $x2, $chromosomes{$read3[2]}); - if( (($pos1-$cutoff)<= $x1 and ($pos1+$motif_len+$cutoff)>=$x1) or (($pos2-$cutoff)<=$x2 and ($pos2+$motif_len+$cutoff)>=$x2 ) ){ - $is_match_a++; - $localcount{motif_ref}++; + if( ($pos1-$cutoff)<= $x1 and ($pos1+$redb{$re}{mlen}+$cutoff)>=$x1) { + $is_match_a=1; $localcount{motif_ref}=1; + } + if($localcount{motif_ref}==0 and $is_match_a == 0 and @read3){ + my $x2= $read3[3]; + if($read3[-1] eq "+"){ $x2 = $read3[3]+length($read3[9]); } + my $pos2 = &binarysearch( $x2, $chromosomes{$read3[2]}); + if( ($pos2-$cutoff)<=$x2 and ($pos2+$redb{$re}{mlen}+$cutoff)>=$x2 ) { + $is_match_a=1; $localcount{motif_ref}=1; + } } } - if($is_match_a==0){ if(linearProximity_bw_mapped_softclip(\@read1,\@read3)==0){ - $localcount{nonHic_chimera}++; - $seq1 = substr($read1[9], 0, $len1[0]); - $qual1 = substr($read1[10], 0, $len1[0]); - - #print $fh1 "@","$read1[0]_$readnum 1:N:0:0 $read1[-2]\n$seq1\n+\n$qual1\n"; - #print $fh2 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$seq1\t$qual1\t"; - if($read1[4]<30){ - print $fh3 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$read1[9]\t$read1[10]\t"; - }else{ - print $fh3 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$seq1\t$qual1\t"; - } - if(@read3){ - if($read3[-3] =~ /^XA:Z:/){ - $xalign = $read3[-3]; - } - $olap = find_olap_in_chimera_mapping(\@read1,\@len1,\@ops1,\@read3); - if(!$olap){$olap="-";} - #print $fh2 "$read3[2]\t$read3[3]\t$read3[-1]\t$read3[4]\t$read3[5]\t$read3[9]\t$olap\t$xalign\n"; - print $fh3 "$read3[2]\t$read3[3]\t$read3[-1]\t$read3[4]\t$read3[5]\t$read3[9]\t$olap\t$xalign\n"; - }else{ - #print $fh2 "-\t-\t-\t-\t-\t-\t-\t-\n"; - print $fh3 "-\t-\t-\t-\t-\t-\t-\t-\n"; - } + $localcount{nonHic_chimera}=1; + my $seq1 = substr($read1[9], 0, $len1[0]); + my $qual1 = substr($read1[10], 0, $len1[0]); + #print $fh1 "@","$read1[0]_$readnum 1:N:0:0 $read1[-2]\n$seq1\n+\n$qual1\n"; + #print $fh2 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$seq1\t$qual1\t"; + if($read1[4]<30){ + print $fh3 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$read1[9]\t$read1[10]\t"; + }else{ + print $fh3 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$seq1\t$qual1\t"; + } + if(@read3){ + $xalign = $read3[-3] if($read3[-3] =~ /^XA:Z:/); + my $olap = find_olap_in_chimera_mapping(\@read1,\@len1,\@ops1,\@read3); + if(!$olap){$olap="-";} + #print $fh2 "$read3[2]\t$read3[3]\t$read3[-1]\t$read3[4]\t$read3[5]\t$read3[9]\t$olap\t$xalign\n"; + print $fh3 "$read3[2]\t$read3[3]\t$read3[-1]\t$read3[4]\t$read3[5]\t$read3[9]\t$olap\t$xalign\n"; + }else{ + #print $fh2 "-\t-\t-\t-\t-\t-\t-\t-\n"; + print $fh3 "-\t-\t-\t-\t-\t-\t-\t-\n"; + } }else{ - $localcount{count_prox}++; + $localcount{count_prox}=1; } } } - - + ## 3' clip of read1 - my $is_match_b=0; if($ops1[-1] eq "S" && $len1[-1] >= $min_clip){ - if(re_search($read1[9],$motif, (length($read1[9])-$len1[-1]) )>0){ - $is_match_b++; - $localcount{motif_spans}++; + $leftoutread=1; + if(re_search($read1[9],$re, (length($read1[9])-$len1[-1]) )>0){ + $is_match_b=1; $localcount{motif_spans}=1; }elsif(RE_within_HardclippedSeq(\@read1,\@len1,\@ops1,\@read3)>0){ - $is_match_b++; - $localcount{motif_hardClip}++; - }elsif(@read3){ + $is_match_b=1; $localcount{motif_hardClip}=1; + }else{ my $x1= $read1[3]+length($read1[9])-$len1[-1]; - my $x2= $read3[3]; - if($read3[-1] eq "-"){ $x2 = $read3[3]+length($read3[9]); } my $pos1 = &binarysearch( $x1, $chromosomes{$read1[2]}); - my $pos2 = &binarysearch( $x2, $chromosomes{$read3[2]}); - if( (($pos1-$cutoff)<= $x1 and ($pos1+$motif_len+$cutoff)>=$x1) or (($pos2-$cutoff)<=$x2 and ($pos2+$motif_len+$cutoff)>=$x2 )){ - $is_match_b++; - $localcount{motif_ref}++; + if( ($pos1-$cutoff)<= $x1 and ($pos1+$redb{$re}{mlen}+$cutoff)>=$x1) { + $is_match_b=1; $localcount{motif_ref}=1; + } + if($localcount{motif_ref} == 0 and $is_match_b == 0 and @read3){ + my $x2= $read3[3]; + if($read3[-1] eq "-"){ $x2 = $read3[3]+length($read3[9]); } + my $pos2 = &binarysearch( $x2, $chromosomes{$read3[2]}); + if( ($pos2-$cutoff)<=$x2 and ($pos2+$redb{$re}{mlen}+$cutoff)>=$x2 ) { + $is_match_b=1; $localcount{motif_ref}=1; + } } } - if($is_match_b==0){ if(linearProximity_bw_mapped_softclip(\@read1,\@read3)==0){ - $localcount{nonHic_chimera}++; - - $seq1 = substr($read1[9],-$len1[-1],$len1[-1]); - $qual1 = substr($read1[10],-$len1[-1],$len1[-1]); - #print $fh1 "@","$read1[0]_$readnum 1:N:0:0 $read1[-2]\n$seq1\n+\n$qual1\n"; - #print $fh2 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$seq1\t$qual1\t"; - if($read1[4]<30){ - print $fh3 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$read1[9]\t$read1[10]\t"; - }else{ - print $fh3 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$seq1\t$qual1\t"; - } - if(@read3){ - if($read3[-3] =~ /^XA:Z:/){ - $xalign = $read3[-3]; - } - $olap = find_olap_in_chimera_mapping(\@read1,\@len1,\@ops1,\@read3); - if(!$olap){$olap="-";} - #print $fh2 "$read3[2]\t$read3[3]\t$read3[-1]\t$read3[4]\t$read3[5]\t$read3[9]\t$olap\t$xalign\n"; - print $fh3 "$read3[2]\t$read3[3]\t$read3[-1]\t$read3[4]\t$read3[5]\t$read3[9]\t$olap\t$xalign\n"; - }else{ - #print $fh2 "-\t-\t-\t-\t-\t-\t-\t-\n"; - print $fh3 "-\t-\t-\t-\t-\t-\t-\t-\n"; - } + $localcount{nonHic_chimera}=1; + my $seq1 = substr($read1[9],-$len1[-1],$len1[-1]); + my $qual1 = substr($read1[10],-$len1[-1],$len1[-1]); + #print $fh1 "@","$read1[0]_$readnum 1:N:0:0 $read1[-2]\n$seq1\n+\n$qual1\n"; + #print $fh2 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$seq1\t$qual1\t"; + if($read1[4]<30){ + print $fh3 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$read1[9]\t$read1[10]\t"; + }else{ + print $fh3 "$read1[0]_$readnum\t$read1[2]\t$read1[3]\t$read1[-1]\t$read1[4]\t$read1[5]\t$seq1\t$qual1\t"; + } + if(@read3){ + $xalign = $read3[-3] if($read3[-3] =~ /^XA:Z:/); + my $olap = find_olap_in_chimera_mapping(\@read1,\@len1,\@ops1,\@read3); + if(!$olap){$olap="-";} + #print $fh2 "$read3[2]\t$read3[3]\t$read3[-1]\t$read3[4]\t$read3[5]\t$read3[9]\t$olap\t$xalign\n"; + print $fh3 "$read3[2]\t$read3[3]\t$read3[-1]\t$read3[4]\t$read3[5]\t$read3[9]\t$olap\t$xalign\n"; + }else{ + #print $fh2 "-\t-\t-\t-\t-\t-\t-\t-\n"; + print $fh3 "-\t-\t-\t-\t-\t-\t-\t-\n"; + } }else{ - $localcount{count_prox}++; + $localcount{count_prox}=1; } } } - - + + ## counters + my $TOTAL = $localcount{motif_spans}+$localcount{motif_hardClip}+$localcount{motif_ref}+ $localcount{count_prox}+$localcount{nonHic_chimera}; if($ops1[-1] eq "S" && $len1[-1] >= $min_clip && $ops1[0] eq "S" && $len1[0] >= $min_clip){ + $leftoutread=1; $counts{mates}{double_clip}++; } - - ## counters - if($localcount{motif_spans}>0 && $localcount{motif_hardClip}==0 && $localcount{motif_ref}==0 && $localcount{count_prox}==0 && $localcount{nonHic_chimera}==0){ - $counts{mates}{motif_spans}++; - }elsif($localcount{motif_spans}>0 && $localcount{motif_hardClip}==0 && $localcount{motif_ref}==0 && $localcount{count_prox}==0 && $localcount{nonHic_chimera}==0){ + if($TOTAL == 1 and $localcount{motif_spans}>0){ $counts{mates}{motif_spans}++; - }elsif($localcount{motif_spans}==0 && $localcount{motif_hardClip}>0 && $localcount{motif_ref}==0 && $localcount{count_prox}==0 && $localcount{nonHic_chimera}==0){ + }elsif($TOTAL == 1 and $localcount{motif_hardClip}>0){ $counts{mates}{motif_hardClip}++; - }elsif($localcount{motif_spans}==0 && $localcount{motif_hardClip}==0 && $localcount{motif_ref}>0 && $localcount{count_prox}==0 && $localcount{nonHic_chimera}==0){ + }elsif($TOTAL == 1 and $localcount{motif_ref}>0){ $counts{mates}{motif_ref}++; - }elsif($localcount{motif_spans}==0 && $localcount{motif_hardClip}==0 && $localcount{motif_ref}==0 && $localcount{count_prox}>0 && $localcount{nonHic_chimera}==0){ + }elsif($TOTAL == 1 and $localcount{count_prox}>0){ $counts{mates}{motif_proximity}++; - }elsif($localcount{motif_spans}==0 && $localcount{motif_hardClip}==0 && $localcount{motif_ref}==0 && $localcount{count_prox}==0 && $localcount{nonHic_chimera}>0){ - $counts{mates}{nonHic_chimera}++; - }elsif($localcount{nonHic_chimera}>0){ + }elsif($TOTAL == 1 and $localcount{nonHic_chimera}>0){ $counts{mates}{nonHic_chimera}++; - }elsif($localcount{nonHic_chimera}==0 && ($is_match_a>0 and $is_match_b>0)){ + }elsif($localcount{nonHic_chimera}==0 and $is_match_a>0 and $is_match_b>0){ $counts{mates}{motif_multiple}++; } - - - if($localcount{motif_spans}==0 && $localcount{motif_hardClip}==0 && $localcount{motif_ref}==0 && $localcount{count_prox}==0 && $localcount{nonHic_chimera}==0){ - if($is_match_a>0 or $is_match_b>0){ - return(2); - }else{ - return(0); - } - - }else{ - return(0); + if($leftoutread==0){ + $counts{mates}{filtered}++; } + return(0); } sub linearProximity_bw_mapped_softclip { @@ -487,7 +499,6 @@ sub linearProximity_bw_mapped_softclip { return(0); } elsif( abs($read1[3]-$read3[3]) <= 500 && $read1[2] eq $read3[2] ){ - ## Tetracutter cuts every 256bp on an average, hence, minimum resolution 256bp. Any two fragments within 2times the distance are likely to be non-informative return(1); } else{ ## for any other unforseen cases, return 0 @@ -555,50 +566,26 @@ sub find_olap_in_chimera_mapping { #find_olap_in_chimera_mapping(\@read1,\@len1 return($seq); } -sub re_search{ #re_search($seq,$motif,-1) - my ($seq,$motif,$clip) = @_; - ## if CIGAR is unavailable, use clip as -1 +sub re_search{ #re_search($seq,$re,-1) + my ($seq,$re,$cl) = @_; + my $isRE = 0; - if($motif eq "" or !defined $motif){ - print " Motif not defined in Dist2RE_fromSeq sub!! exiting! \n"; + if($redb{$re}{motif} eq ""){ + print " Motif not defined!! exiting! \n"; exit 1; - }elsif($motif eq "GATC"){ ########## DpnII/ Mbo1 - my $c = index($seq, "GATCGATC",0); - return(0) if($c == -1); - return(1) if($c != -1 and $clip == -1); - $clip -=1 if($clip !=0); - if($c <= $clip and ($c+8) >= $clip ){ - return(1); - }else{ - return(0); - } - }elsif($motif eq "AAGCTT"){ ######## HindIII - my $c = index($seq, "AAGCTAGCTT",0); - return(0) if($c == -1); - return(1) if($c != -1 and $clip == -1); - $clip -=1 if($clip !=0); - if($c <= $clip and ($c+10) >= $clip ){ - return(1); - }else{ - return(0); - } } -} - -sub re_search_stringent { #re_search_stringent($seq) - my $seq= shift; - my @patterns; - - if($motif eq "GATC"){ - @patterns = (qr/GATCGATC/); - } - if($motif eq "AAGCTT"){ - @patterns = (qr/AAGCTAGCTT/); + if($cl == -1){ + if($seq=~ m/$redb{$re}{motif}/g){ + $isRE=1; + } + }else{ + while ($seq =~ /$redb{$re}{motif}/g){ + if(($cl-1)>= $-[0] and $cl<= $+[0] and $isRE ==0) { + $isRE = 1; + } + } } - - my $is_match=0; - $is_match++ if( $seq ~~ @patterns ); - return($is_match); + return($isRE); } sub RE_within_HardclippedSeq { #RE_within_HardclippedSeq(\@read1,\@len1,\@ops1,\@read3) @@ -636,8 +623,6 @@ sub RE_within_HardclippedSeq { #RE_within_HardclippedSeq(\@read1,\@len1,\@ops1,\ @ops3 = reverse(@ops3); } - - my $j=0; my $i=0; for($i=0;$i= $len_RE){ - if(re_search_stringent($seq)==0){ + if(length($seq)>= $redb{$re}{mlen}){ + if(re_search($seq,$re,-1)==0){ return(0); }else{ return(1); @@ -683,7 +668,6 @@ sub binarysearch { #binarysearch($pos, \@locations) my $q1=0; my $q2=0; - while ($l <= $u) { $i = int(($l + $u)/2); if ($a->[$i] < $x) { @@ -693,7 +677,6 @@ sub binarysearch { #binarysearch($pos, \@locations) $u = $i-1; } else { - # return $i+1; #original $q1 = abs($a->[$i] - $x); $q2 = $q1; if($a->[$i+1]){ @@ -704,12 +687,6 @@ sub binarysearch { #binarysearch($pos, \@locations) }else{ return($a->[$i]); # found } - #if($upper==1){ - # return($q2); - #}else{ - # return($q1); - #} - } } $q2 = abs($a->[$l-1] - $x); @@ -722,12 +699,6 @@ sub binarysearch { #binarysearch($pos, \@locations) }else{ return($a->[$l-1]); # found } - #if($upper==1){ - # return($q1); - #}else{ - # return($q2); - #} - #return($l); #original } sub flag2binary { #flag2binary($num) @@ -828,4 +799,14 @@ sub parse_psam_reads { #parse_psam_reads($side1,$side2) return(\@r1,\@r2, \@r3, \@r4); } -#####----------------------------------------- END --------------------------- \ No newline at end of file +sub round { + my ($n, $places) = @_; + my $abs = abs $n; + my $val = substr($abs + ('0.' . '0' x $places . '5'), + 0, + length(int($abs)) + + (($places > 0) ? $places + 1 : 0) + ); + ($n < 0) ? "-" . $val : $val; +} +#####----------------------------------------- END ---------------------------