From d7431c127b8d77b4d8944a0899db6f2de69d508a Mon Sep 17 00:00:00 2001 From: olaaustine Date: Wed, 26 Jul 2023 16:21:28 +0100 Subject: [PATCH 01/11] Adding check for population sets --- scripts/import/dbSNP_v2/update_new_sets.pl | 79 ++++++++++++++++++---- 1 file changed, 66 insertions(+), 13 deletions(-) diff --git a/scripts/import/dbSNP_v2/update_new_sets.pl b/scripts/import/dbSNP_v2/update_new_sets.pl index 41ab1b783..8668a1cfe 100644 --- a/scripts/import/dbSNP_v2/update_new_sets.pl +++ b/scripts/import/dbSNP_v2/update_new_sets.pl @@ -3,6 +3,7 @@ use DBI; use Socket; use Bio::EnsEMBL::Registry; +use Data::Dumper; use Getopt::Long; use POSIX qw(strftime); use Cwd qw(cwd); @@ -62,15 +63,16 @@ my $chunk = 1000000; my $max_id = $vf->[0]->[1]; -debug($config, "Dumping the variation sets and the new variation feature into files"); -for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { - dump_old_sql_variation_sets($old_dbh, $tmp_num, $chunk, $max_id); -} +#debug($config, "Dumping the variation sets and the new variation feature into files"); +#for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { +# dump_old_sql_variation_sets($old_dbh, $tmp_num, $chunk, $max_id); +#} -for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { - dump_new_variation_feature($dbh, $tmp_num, $chunk, $max_id); -} +#for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { +# dump_new_variation_feature($dbh, $tmp_num, $chunk, $max_id); +#} +=head debug($config, "Sorting the files"); system(sort -u $new_vf_file); system(sort -u $tmp_vset); @@ -89,11 +91,16 @@ dump_new_variation_sets($dbh, $tmp_num, $chunk, $max_id); } +=cut + +debug($config, "Recalculating the variation sets"); +recalculate($tmp_merged, $tmp_vs_file); + debug($config, "Updating the variation feature table"); update_variation_feature_table($dbh, $tmp_vs_file); - +=head debug($config, "Adding failed variation to variation set"); $dbh->do( qq{ INSERT IGNORE INTO variation_set_variation (variation_id, variation_set_id) @@ -105,6 +112,7 @@ ALTER TABLE variation_set_variation ENABLE keys; }) or die "Failed to alter variation_set_variation keys"; +=cut sub temp_table { my $dbhvar = shift; @@ -124,9 +132,9 @@ sub create_merged_file { my $new_vf_file = shift; my $tmp_merged = shift; - open FILE1, "<", "$file" or die "Cannot open $file: $!"; - open FILE2, "<", "$second_file" or die "Cannot open $second_file: $!"; - open OUTPUT, ">", "$third_file" or die "Cannot open $third_file: $!"; + open FILE1, "<", "$tmp_vset" or die "Cannot open $tmp_vset: $!"; + open FILE2, "<", "$new_vf_file" or die "Cannot open $new_vf_file: $!"; + open OUTPUT, ">", "$tmp_merged" or die "Cannot open $tmp_merged: $!"; my %data; while () { @@ -194,8 +202,6 @@ sub update_variation_feature_table { chomp; my $var_id = (split)[0]; my $var_set_id = (split)[1]; - use Data::Dumper; - print Dumper($var_set_id, $var_id); $update_temp_vf->execute($var_set_id, $var_id); # creating a hash which has the var_id has the key and the set var_set_id has the values } @@ -295,6 +301,53 @@ sub dump_new_variation_feature { } +sub get_structure { + my $dbhvar = shift; + + my $get_struc_sth = $dbhvar->prepare(qq[ select variation_set_sub, variation_set_super from variation_set_structure]); + + my %parent; + $get_struc_sth->execute() ||die; + my $dat = $get_struc_sth->fetchall_arrayref(); + foreach my $l(@{$dat}){ + $parent{$l->[0]} = $l->[1] ; + } + return \%parent; + +} + +sub recalculate { + my $input_file = shift; + my $output_file = shift; + + my $parent = get_structure($dbh); + my @sets; + my %concat_sets; + + open FH, "<", "$input_file" or die "Can not open $input_file: $!"; + + while () { + chomp; + my $var_id = split(0); + my $var_set_id = (split)[1]; + push @sets, $var_set_id; + if (exists $parent->{$var_set_id}){ + push @sets, $parent->{$var_set_id}; + push @sets, $parent->{$parent->{$var_set_id}} if exists $parent->{$parent->{$var_set_id}}; + } + push $concat_sets{$var_id} = [\@sets]; + } + + open(my $fh, '>', $output_file) or die "Could not open file '$output_file': $!"; + + # Serialize the hash and write to the file + print $fh Dumper(\%concat_sets); + + # Close the file + close $fh; + close FH; +} + sub usage { die "\n\tUsage: update_new_sets.pl -registry [registry file] -release [release number] \tOptional: -tmp [temp folder] or gets set based on current directory From c20f2125f1d9d355d7705d51eb228412b6c0323d Mon Sep 17 00:00:00 2001 From: olaaustine Date: Wed, 2 Aug 2023 09:52:40 +0100 Subject: [PATCH 02/11] Adding the variation sets to the variation feature table --- scripts/import/dbSNP_v2/update_new_sets.pl | 77 ++++++++++++++-------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/scripts/import/dbSNP_v2/update_new_sets.pl b/scripts/import/dbSNP_v2/update_new_sets.pl index 8668a1cfe..dbf621f18 100644 --- a/scripts/import/dbSNP_v2/update_new_sets.pl +++ b/scripts/import/dbSNP_v2/update_new_sets.pl @@ -63,16 +63,15 @@ my $chunk = 1000000; my $max_id = $vf->[0]->[1]; -#debug($config, "Dumping the variation sets and the new variation feature into files"); -#for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { -# dump_old_sql_variation_sets($old_dbh, $tmp_num, $chunk, $max_id); -#} +debug($config, "Dumping the variation sets and the new variation feature into files"); +for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { + dump_old_sql_variation_sets($old_dbh, $tmp_num, $chunk, $max_id); +} -#for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { -# dump_new_variation_feature($dbh, $tmp_num, $chunk, $max_id); -#} +for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { + dump_new_variation_feature($dbh, $tmp_num, $chunk, $max_id); +} -=head debug($config, "Sorting the files"); system(sort -u $new_vf_file); system(sort -u $tmp_vset); @@ -91,7 +90,6 @@ dump_new_variation_sets($dbh, $tmp_num, $chunk, $max_id); } -=cut debug($config, "Recalculating the variation sets"); recalculate($tmp_merged, $tmp_vs_file); @@ -100,7 +98,7 @@ update_variation_feature_table($dbh, $tmp_vs_file); -=head + debug($config, "Adding failed variation to variation set"); $dbh->do( qq{ INSERT IGNORE INTO variation_set_variation (variation_id, variation_set_id) @@ -112,7 +110,7 @@ ALTER TABLE variation_set_variation ENABLE keys; }) or die "Failed to alter variation_set_variation keys"; -=cut + sub temp_table { my $dbhvar = shift; @@ -187,23 +185,36 @@ sub load_all_variation_sets { } sub update_variation_feature_table { - # this function after populating the variation_feature_backup table created by inserting from the original table would then update the variation_set_id column uaing the file from the - # dump_new_variation_sets + # this function after populating the variation_feature_backup table created by inserting from the original table would then update the variation_set_id column uaing the file from the recalculate + # using the parents and parents set id to update the variation feature table my $dbhvar = shift; my $load_file = shift; my $update_temp_vf = $dbhvar->prepare(q{ UPDATE variation_feature SET variation_set_id = ? - WHERE variation_id = ? AND variation_set_id = ''}); + WHERE variation_id = ? }); #my %var_data; open FH, "<", "$load_file" or die "Can not open $load_file: $!"; while () { chomp; - my $var_id = (split)[0]; - my $var_set_id = (split)[1]; + my @fields = split("\t"); + my $var_id = $fields[0]; + my $var_set_id = $fields[1]; + - $update_temp_vf->execute($var_set_id, $var_id); # creating a hash which has the var_id has the key and the set var_set_id has the values + my @sets_array; + # to make sure only unique numbers are in the array + foreach my $x (split(',', $var_set_id)){ + push @sets_array, $x if !grep{$_ eq $x}@sets_array; + } + + my @sorted_array = sort { $a<=>$b } @sets_array; + my $values = join(',', @sorted_array); + $values =~ s/\s*,\s*/,/g; # to eliminate spaces and stuff + $values =~ s/^\s+//; #to eliminate spaces and stuff + + $update_temp_vf->execute($values, $var_id); # creating a hash which has the var_id has the key and the set var_set_id has the values } close FH; @@ -321,28 +332,36 @@ sub recalculate { my $output_file = shift; my $parent = get_structure($dbh); - my @sets; my %concat_sets; open FH, "<", "$input_file" or die "Can not open $input_file: $!"; + while () { chomp; - my $var_id = split(0); - my $var_set_id = (split)[1]; - push @sets, $var_set_id; - if (exists $parent->{$var_set_id}){ - push @sets, $parent->{$var_set_id}; - push @sets, $parent->{$parent->{$var_set_id}} if exists $parent->{$parent->{$var_set_id}}; + my @fields = split("\t"); + my $var_id = $fields[0]; + my $var_set_id = $fields[1]; + my @sets; + if (exists $concat_sets{$var_id}) { + $concat_sets{$var_id} = [] unless ref $concat_sets{$var_id} eq 'ARRAY'; + + push @{$concat_sets{$var_id}}, $var_set_id; + push @{$concat_sets{$var_id}}, $parent->{$var_set_id} if exists $parent->{$var_set_id}; #pushing parents and var_set_id + push @{$concat_sets{$var_id}}, $parent->{$parent->{$var_set_id}} if exists $parent->{$parent->{$var_set_id}}; + } else { # if it does not exist, it just creates a new key and an array + $concat_sets{$var_id} = $var_set_id; } - push $concat_sets{$var_id} = [\@sets]; + } - open(my $fh, '>', $output_file) or die "Could not open file '$output_file': $!"; - - # Serialize the hash and write to the file - print $fh Dumper(\%concat_sets); + open(my $fh, '>', $output_file) or die "Could not open file '$output_file': $!"; + foreach my $var_id (keys %concat_sets) { + my $values_str = join(", ", @{$concat_sets{$var_id}}); + print $fh "$var_id\t$values_str\n"; # adding the values str to it + } + # Close the file close $fh; close FH; From f539bdd38cfebcdbc6a68b99e925f40598f0f472 Mon Sep 17 00:00:00 2001 From: olaaustine Date: Thu, 3 Aug 2023 11:32:11 +0100 Subject: [PATCH 03/11] Adding comments --- scripts/import/dbSNP_v2/update_new_sets.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/import/dbSNP_v2/update_new_sets.pl b/scripts/import/dbSNP_v2/update_new_sets.pl index dbf621f18..9bc50fe9a 100644 --- a/scripts/import/dbSNP_v2/update_new_sets.pl +++ b/scripts/import/dbSNP_v2/update_new_sets.pl @@ -91,7 +91,7 @@ } -debug($config, "Recalculating the variation sets"); +debug($config, "Recalculating the variation sets"); # takes from the merged file and recalculates and creates the concatenate file that will be used to update variation feature recalculate($tmp_merged, $tmp_vs_file); debug($config, "Updating the variation feature table"); From 50e588120ad56af5418e08bd66906f4015674391 Mon Sep 17 00:00:00 2001 From: olaaustine Date: Tue, 8 Aug 2023 15:57:24 +0100 Subject: [PATCH 04/11] Making changes based on review --- scripts/import/dbSNP_v2/update_new_sets.pl | 102 +++++++-------------- 1 file changed, 35 insertions(+), 67 deletions(-) diff --git a/scripts/import/dbSNP_v2/update_new_sets.pl b/scripts/import/dbSNP_v2/update_new_sets.pl index 9bc50fe9a..5732d75b3 100644 --- a/scripts/import/dbSNP_v2/update_new_sets.pl +++ b/scripts/import/dbSNP_v2/update_new_sets.pl @@ -63,6 +63,7 @@ my $chunk = 1000000; my $max_id = $vf->[0]->[1]; + debug($config, "Dumping the variation sets and the new variation feature into files"); for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { dump_old_sql_variation_sets($old_dbh, $tmp_num, $chunk, $max_id); @@ -85,11 +86,6 @@ debug($config, "Loading new variation sets from merged file"); load_all_variation_sets($dbh, $tmp_merged); -debug($config, "Dumping new variation sets into a file to update the variation feature table"); -for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { - dump_new_variation_sets($dbh, $tmp_num, $chunk, $max_id); -} - debug($config, "Recalculating the variation sets"); # takes from the merged file and recalculates and creates the concatenate file that will be used to update variation feature recalculate($tmp_merged, $tmp_vs_file); @@ -98,7 +94,6 @@ update_variation_feature_table($dbh, $tmp_vs_file); - debug($config, "Adding failed variation to variation set"); $dbh->do( qq{ INSERT IGNORE INTO variation_set_variation (variation_id, variation_set_id) @@ -171,15 +166,15 @@ sub load_all_variation_sets { my $sql = qq{INSERT INTO variation_set_variation (variation_id, variation_set_id ) VALUES (?, ?)}; my $sth = $dbhvar->prepare($sql); - open FH, "<", "$load_file" or die "Can not open $load_file: $!"; - while () { + open my $load_vs, "<", "$load_file" or die "Can not open $load_file: $!"; + while (<$load_vs>) { chomp; my $var_id = (split)[0]; my $var_set_id = (split)[1]; $sth->execute($var_id, $var_set_id); } - close FH; + close $load_vs; $sth->finish(); } @@ -195,8 +190,8 @@ sub update_variation_feature_table { WHERE variation_id = ? }); #my %var_data; - open FH, "<", "$load_file" or die "Can not open $load_file: $!"; - while () { + open my $load_fh, "<", "$load_file" or die "Can not open $load_file: $!"; + while (<$load_fh>) { chomp; my @fields = split("\t"); my $var_id = $fields[0]; @@ -204,11 +199,9 @@ sub update_variation_feature_table { my @sets_array; - # to make sure only unique numbers are in the array - foreach my $x (split(',', $var_set_id)){ - push @sets_array, $x if !grep{$_ eq $x}@sets_array; - } - + + my @sets_array = unique(split(',', $var_set_id)); + my @sorted_array = sort { $a<=>$b } @sets_array; my $values = join(',', @sorted_array); $values =~ s/\s*,\s*/,/g; # to eliminate spaces and stuff @@ -217,37 +210,10 @@ sub update_variation_feature_table { $update_temp_vf->execute($values, $var_id); # creating a hash which has the var_id has the key and the set var_set_id has the values } - close FH; + close $load_fh; $update_temp_vf->finish(); } - -sub dump_new_variation_sets { - # this would dump the variation_sets table from the backup table and create a file vset_concat.txt with two columns which are var_id and sets of variation_set_id - my $dbhvar = shift; - my $tmp_num = shift; - my $chunk = shift; - my $size = shift; - - my $start = $chunk * $tmp_num; - my $end = $chunk + $start; - $end = $end < $size ? $end : $size; - - my $dump_vs = $dbhvar->prepare(qq{ SELECT variation_id, GROUP_CONCAT(DISTINCT(variation_set_id)) FROM variation_set_variation WHERE variation_id > $start AND variation_id <= $end GROUP BY variation_id}); - open (FH, ">>$TMP_DIR/$tmp_vs_file" ) - or die( "Cannot open $TMP_DIR/$tmp_vs_file: $!" ); - $dump_vs->execute(); - - while ( my $aref = $dump_vs->fetchrow_arrayref() ) { - my @a = map {defined($_) ? $_ : '\N'} @$aref; - print FH join("\t", @a), "\n"; - } - - $dump_vs->finish(); - close FH; -} - - sub dump_old_sql_variation_sets { # this would dump old variation_sets table from the old database and create a file called variation_name_set.txt with two columns var_id and variation_set_id my $dbhvar = shift; @@ -264,19 +230,18 @@ sub dump_old_sql_variation_sets { ON v.variation_id = vs.variation_id where variation_set_id != 1 AND v.variation_id > $start AND v.variation_id <= $end }; my $sth = $dbhvar->prepare($sql); - local *FH; - open (FH, ">>$TMP_DIR/$tmp_vset" ) + open (my $dump_fh, ">>$TMP_DIR/$tmp_vset" ) or die( "Cannot open $TMP_DIR/$tmp_vset: $!" ); $sth->execute(); while ( my $aref = $sth->fetchrow_arrayref() ) { my @a = map {defined($_) ? $_ : '\N'} @$aref; - print FH join("\t", @a), "\n"; + print $dump_fh join("\t", @a), "\n"; } $sth->finish(); - close FH; + close $dump_fh; } @@ -295,19 +260,18 @@ sub dump_new_variation_feature { my $sql = qq{SELECT DISTINCT(variation_id), variation_name from variation_feature where variation_id > $start AND variation_id <= $end }; my $sth = $dbh->prepare($sql); - local *FH; - open (FH, ">>$TMP_DIR/$new_vf_file" ) + open (my $dump_var, ">>$TMP_DIR/$new_vf_file" ) or die( "Cannot open $TMP_DIR/$new_vf_file: $!" ); $sth->execute(); while ( my $aref = $sth->fetchrow_arrayref() ) { my @a = map {defined($_) ? $_ : '\N'} @$aref; - print FH join("\t", @a), "\n"; + print $dump_var join("\t", @a), "\n"; } $sth->finish(); - close FH; + close $dump_var; } @@ -334,37 +298,41 @@ sub recalculate { my $parent = get_structure($dbh); my %concat_sets; - open FH, "<", "$input_file" or die "Can not open $input_file: $!"; + open my $re_input, "<", "$input_file" or die "Can not open $input_file: $!"; - while () { + while (<$re_input>) { chomp; my @fields = split("\t"); my $var_id = $fields[0]; my $var_set_id = $fields[1]; my @sets; - if (exists $concat_sets{$var_id}) { - $concat_sets{$var_id} = [] unless ref $concat_sets{$var_id} eq 'ARRAY'; - - push @{$concat_sets{$var_id}}, $var_set_id; - push @{$concat_sets{$var_id}}, $parent->{$var_set_id} if exists $parent->{$var_set_id}; #pushing parents and var_set_id - push @{$concat_sets{$var_id}}, $parent->{$parent->{$var_set_id}} if exists $parent->{$parent->{$var_set_id}}; - } else { # if it does not exist, it just creates a new key and an array - $concat_sets{$var_id} = $var_set_id; - } + + push @{$concat_sets{$var_id}}, $var_set_id; + push @{$concat_sets{$var_id}}, $parent->{$var_set_id} if exists $parent->{$var_set_id}; #pushing parents and var_set_id + push @{$concat_sets{$var_id}}, $parent->{$parent->{$var_set_id}} if exists $parent->{$parent->{$var_set_id}}; } - open(my $fh, '>', $output_file) or die "Could not open file '$output_file': $!"; + open(my $re_output, '>', $output_file) or die "Could not open file '$output_file': $!"; foreach my $var_id (keys %concat_sets) { my $values_str = join(", ", @{$concat_sets{$var_id}}); - print $fh "$var_id\t$values_str\n"; # adding the values str to it + print $re_output "$var_id\t$values_str\n"; # adding the values str to it } # Close the file - close $fh; - close FH; + close $re_output; + close $re_input; +} + +sub unique { + my @array = shift; + + my %u; + map { $u{$_} = 1; } @_; + return keys %u; + } sub usage { From 749652b7013d8afaca4e08c0d0c73aae1a798297 Mon Sep 17 00:00:00 2001 From: olaaustine Date: Tue, 8 Aug 2023 16:40:06 +0100 Subject: [PATCH 05/11] Adding the check for only dbsnp source --- scripts/import/dbSNP_v2/update_new_sets.pl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/import/dbSNP_v2/update_new_sets.pl b/scripts/import/dbSNP_v2/update_new_sets.pl index 5732d75b3..fe3a96426 100644 --- a/scripts/import/dbSNP_v2/update_new_sets.pl +++ b/scripts/import/dbSNP_v2/update_new_sets.pl @@ -3,7 +3,6 @@ use DBI; use Socket; use Bio::EnsEMBL::Registry; -use Data::Dumper; use Getopt::Long; use POSIX qw(strftime); use Cwd qw(cwd); @@ -227,7 +226,7 @@ sub dump_old_sql_variation_sets { my $sql = qq{SELECT v.name, vs.variation_set_id from variation_set_variation vs LEFT JOIN variation v - ON v.variation_id = vs.variation_id where variation_set_id != 1 AND v.variation_id > $start AND v.variation_id <= $end }; + ON v.variation_id = vs.variation_id where vs.variation_set_id != 1 AND v.source_id = 1 AND v.variation_id > $start AND v.variation_id <= $end }; my $sth = $dbhvar->prepare($sql); open (my $dump_fh, ">>$TMP_DIR/$tmp_vset" ) From f2018dbbe655db112e936771ab1b80e93920292d Mon Sep 17 00:00:00 2001 From: Ola Austine <83219714+olaaustine@users.noreply.github.com> Date: Fri, 18 Aug 2023 09:54:13 +0100 Subject: [PATCH 06/11] Chunking for variation sets --- scripts/import/dbSNP_v2/update_new_sets.pl | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/scripts/import/dbSNP_v2/update_new_sets.pl b/scripts/import/dbSNP_v2/update_new_sets.pl index fe3a96426..734ab3815 100644 --- a/scripts/import/dbSNP_v2/update_new_sets.pl +++ b/scripts/import/dbSNP_v2/update_new_sets.pl @@ -181,15 +181,22 @@ sub load_all_variation_sets { sub update_variation_feature_table { # this function after populating the variation_feature_backup table created by inserting from the original table would then update the variation_set_id column uaing the file from the recalculate # using the parents and parents set id to update the variation feature table - my $dbhvar = shift; - my $load_file = shift; + my $dbh = shift; + my $tmp_num = shift; + my $chunk = shift; + my $size = shift; + + my $start = $chunk * $tmp_num; + my $end = $chunk + $start; + $end = $end < $size ? $end : $size; my $update_temp_vf = $dbhvar->prepare(q{ UPDATE variation_feature SET variation_set_id = ? - WHERE variation_id = ? }); + WHERE variation_id = ? AND variation_id > $start AND variation_id <= $end }); #my %var_data; - open my $load_fh, "<", "$load_file" or die "Can not open $load_file: $!"; + + open my $load_fh, "<", "$TMP_DIR/$tmp_vs_file" or die "Can not open $TMP_DIR/$tmp_vs_file: $!"; while (<$load_fh>) { chomp; my @fields = split("\t"); From 0d007ccac9c1e9c33ac1cb694236e41b31f85cc4 Mon Sep 17 00:00:00 2001 From: Ola Austine <83219714+olaaustine@users.noreply.github.com> Date: Fri, 18 Aug 2023 10:13:42 +0100 Subject: [PATCH 07/11] Making final changes --- scripts/import/dbSNP_v2/update_new_sets.pl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/import/dbSNP_v2/update_new_sets.pl b/scripts/import/dbSNP_v2/update_new_sets.pl index 734ab3815..b6b9ecdab 100644 --- a/scripts/import/dbSNP_v2/update_new_sets.pl +++ b/scripts/import/dbSNP_v2/update_new_sets.pl @@ -90,8 +90,9 @@ recalculate($tmp_merged, $tmp_vs_file); debug($config, "Updating the variation feature table"); -update_variation_feature_table($dbh, $tmp_vs_file); - +for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { + update_variation_feature_table($dbh, $tmp_num, $chunk, $max_id); +} debug($config, "Adding failed variation to variation set"); $dbh->do( qq{ @@ -196,7 +197,7 @@ sub update_variation_feature_table { #my %var_data; - open my $load_fh, "<", "$TMP_DIR/$tmp_vs_file" or die "Can not open $TMP_DIR/$tmp_vs_file: $!"; + open my $load_fh, "<", "$TMP_DIR/$tmp_vs_file" or die "Can not open $TMP_DIR/$tmp_vs_file`: $!"; while (<$load_fh>) { chomp; my @fields = split("\t"); From 476f0e2e79147f70f74fb9eb9c8af2f0c3458965 Mon Sep 17 00:00:00 2001 From: olaaustine Date: Fri, 18 Aug 2023 13:42:07 +0100 Subject: [PATCH 08/11] Changes after testing to the chunking process --- scripts/import/dbSNP_v2/update_new_sets.pl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/import/dbSNP_v2/update_new_sets.pl b/scripts/import/dbSNP_v2/update_new_sets.pl index b6b9ecdab..3da5c8e59 100644 --- a/scripts/import/dbSNP_v2/update_new_sets.pl +++ b/scripts/import/dbSNP_v2/update_new_sets.pl @@ -89,6 +89,8 @@ debug($config, "Recalculating the variation sets"); # takes from the merged file and recalculates and creates the concatenate file that will be used to update variation feature recalculate($tmp_merged, $tmp_vs_file); +=cut + debug($config, "Updating the variation feature table"); for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { update_variation_feature_table($dbh, $tmp_num, $chunk, $max_id); @@ -192,9 +194,10 @@ sub update_variation_feature_table { $end = $end < $size ? $end : $size; - my $update_temp_vf = $dbhvar->prepare(q{ UPDATE variation_feature SET variation_set_id = ? - WHERE variation_id = ? AND variation_id > $start AND variation_id <= $end }); + my $update_sql = qq{ UPDATE variation_feature SET variation_set_id = ? + WHERE variation_id = ? AND variation_id > $start AND variation_id <= $end}; + my $update_temp_vf = $dbh->prepare($update_sql); #my %var_data; open my $load_fh, "<", "$TMP_DIR/$tmp_vs_file" or die "Can not open $TMP_DIR/$tmp_vs_file`: $!"; From 74691a980be2ef5be8e98fc10ba2ae899f952413 Mon Sep 17 00:00:00 2001 From: olaaustine Date: Fri, 18 Aug 2023 16:51:21 +0100 Subject: [PATCH 09/11] Removing cut --- scripts/import/dbSNP_v2/update_new_sets.pl | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/import/dbSNP_v2/update_new_sets.pl b/scripts/import/dbSNP_v2/update_new_sets.pl index 3da5c8e59..20cb4e72d 100644 --- a/scripts/import/dbSNP_v2/update_new_sets.pl +++ b/scripts/import/dbSNP_v2/update_new_sets.pl @@ -89,8 +89,6 @@ debug($config, "Recalculating the variation sets"); # takes from the merged file and recalculates and creates the concatenate file that will be used to update variation feature recalculate($tmp_merged, $tmp_vs_file); -=cut - debug($config, "Updating the variation feature table"); for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { update_variation_feature_table($dbh, $tmp_num, $chunk, $max_id); From 213e7b8a3e4e209dc03315555781e023c944e541 Mon Sep 17 00:00:00 2001 From: olaaustine Date: Fri, 18 Aug 2023 17:36:53 +0100 Subject: [PATCH 10/11] Fix for one var_set_id --- scripts/import/dbSNP_v2/update_new_sets.pl | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/scripts/import/dbSNP_v2/update_new_sets.pl b/scripts/import/dbSNP_v2/update_new_sets.pl index 20cb4e72d..3855de515 100644 --- a/scripts/import/dbSNP_v2/update_new_sets.pl +++ b/scripts/import/dbSNP_v2/update_new_sets.pl @@ -89,6 +89,7 @@ debug($config, "Recalculating the variation sets"); # takes from the merged file and recalculates and creates the concatenate file that will be used to update variation feature recalculate($tmp_merged, $tmp_vs_file); + debug($config, "Updating the variation feature table"); for my $tmp_num (map { $_ } $min_id/$chunk .. $max_id/$chunk) { update_variation_feature_table($dbh, $tmp_num, $chunk, $max_id); @@ -105,7 +106,7 @@ ALTER TABLE variation_set_variation ENABLE keys; }) or die "Failed to alter variation_set_variation keys"; - + sub temp_table { my $dbhvar = shift; @@ -163,7 +164,7 @@ sub load_all_variation_sets { my $dbhvar = shift; my $load_file = shift; - my $sql = qq{INSERT INTO variation_set_variation (variation_id, variation_set_id ) VALUES (?, ?)}; + my $sql = qq{INSERT IGNORE INTO variation_set_variation (variation_id, variation_set_id ) VALUES (?, ?)}; my $sth = $dbhvar->prepare($sql); open my $load_vs, "<", "$load_file" or die "Can not open $load_file: $!"; @@ -207,13 +208,16 @@ sub update_variation_feature_table { my @sets_array; - - my @sets_array = unique(split(',', $var_set_id)); - - my @sorted_array = sort { $a<=>$b } @sets_array; - my $values = join(',', @sorted_array); - $values =~ s/\s*,\s*/,/g; # to eliminate spaces and stuff - $values =~ s/^\s+//; #to eliminate spaces and stuff + my $values; + if ($var_set_id =~ /,/) { + my @sets_array = unique(split(',', $var_set_id)); + my @sorted_array = sort { $a<=>$b } @sets_array; + $values = join(',', @sorted_array); + $values =~ s/\s*,\s*/,/g; # to eliminate spaces and stuff + $values =~ s/^\s+//; #to eliminate spaces and stuff + } else { + $values = "$var_set_id"; + } $update_temp_vf->execute($values, $var_id); # creating a hash which has the var_id has the key and the set var_set_id has the values } From cf58c2ba2cfe7630bb088e4162562bfc3a0fe3da Mon Sep 17 00:00:00 2001 From: olaaustine Date: Fri, 18 Aug 2023 17:38:34 +0100 Subject: [PATCH 11/11] removing the ignore --- scripts/import/dbSNP_v2/update_new_sets.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/import/dbSNP_v2/update_new_sets.pl b/scripts/import/dbSNP_v2/update_new_sets.pl index 3855de515..597550461 100644 --- a/scripts/import/dbSNP_v2/update_new_sets.pl +++ b/scripts/import/dbSNP_v2/update_new_sets.pl @@ -164,7 +164,7 @@ sub load_all_variation_sets { my $dbhvar = shift; my $load_file = shift; - my $sql = qq{INSERT IGNORE INTO variation_set_variation (variation_id, variation_set_id ) VALUES (?, ?)}; + my $sql = qq{INSERT INTO variation_set_variation (variation_id, variation_set_id ) VALUES (?, ?)}; my $sth = $dbhvar->prepare($sql); open my $load_vs, "<", "$load_file" or die "Can not open $load_file: $!";