Skip to content

Commit

Permalink
fix: Remove duplicate products on wrong flavours (#10928)
Browse files Browse the repository at this point in the history
Script to remove duplicate products that are on multiple flavors.
Will be used with this file:
https://docs.google.com/spreadsheets/d/1-2WMvUC4J7iRYe3587mHJ1htIxPFyo7JLDLKHVmSum0/edit?gid=1565589772#gid=1565589772

---------

Co-authored-by: Off <off@openfoodfacts.org>
Co-authored-by: Open Food Facts Bot <contact@openfoodfacts.org>
  • Loading branch information
3 people authored Nov 4, 2024
1 parent 09a0c77 commit 2dd39c5
Show file tree
Hide file tree
Showing 4 changed files with 272 additions and 55 deletions.
16 changes: 13 additions & 3 deletions scripts/check_products_in_mongodb.pl
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@
my $code_different_than_id = 0;
my $not_normalized_code = 0;
my $invalid = 0;
my $exists_only_in_db = 0;
my $i = 0;

while (my $product_ref = $cursor->next) {

Expand All @@ -150,15 +152,14 @@
$codes_lengths{$code_len}++;

my $to_be_fixed = 0;
my $normalized_code = normalize_code($code);

if ($code ne $productid) {
$code_different_than_id++;
print STDERR "Code different than productid. code: $code - productid: $productid\n";
$to_be_fixed = 1;
}

my $normalized_code = normalize_code($code);
if ($normalized_code eq 'invalid') {
elsif ($normalized_code eq 'invalid') {
$invalid++;
$to_be_fixed = 1;
print STDERR "Invalid code: $code\n";
Expand All @@ -168,6 +169,11 @@
$to_be_fixed = 1;
print STDERR "Not normalized code. code: $code - normalized: $normalized_code\n";
}
elsif (!-e "$data_root/products/$path/product.sto") {
$to_be_fixed = 1;
$exists_only_in_db++;
print STDERR "Product $productid - data_root/products/$path/product.sto does not exist in the filesystem\n";
}

if ($fix and $to_be_fixed) {

Expand Down Expand Up @@ -202,6 +208,9 @@
}

}

$i++;
($i % 1000 == 0) and print STDERR "$i products checked\n";
}

# Print the code lengths
Expand All @@ -212,5 +221,6 @@

print STDERR "Code different than id: $code_different_than_id\n";
print STDERR "Not normalized code: $not_normalized_code\n";
print STDERR "Products that existed only in MongoDB: $exists_only_in_db\n";

exit(0);
52 changes: 0 additions & 52 deletions scripts/migrations/2024_09_detect_duplicate_products.pl

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/perl -w

use Modern::Perl '2017';
use utf8;

use ProductOpener::Config qw/:all/;
use ProductOpener::Products qw/:all/;
use ProductOpener::Paths qw/%BASE_DIRS/;
use ProductOpener::Store qw/retrieve store/;
use ProductOpener::Data qw/get_products_collection/;

use Log::Any::Adapter 'TAP';

my $socket_timeout_ms = 2 * 60000;    # 2 mins, instead of 30s default, to not die as easily if mongodb is busy.

# Per-barcode aggregates, filled while scanning the products collection of every flavor.
my %flavors = ();    # $flavors{all}{$code} and $flavors{$flavor}{$code}: number of documents seen for the code
my %scans = ();    # highest scans_n value seen for the code
my %product_names = ();    # first defined product_name seen for the code
my %brands = ();    # first defined brands value seen for the code
my %flavor_with_most_data = ();    # flavor whose product.sto file is the biggest for the code
my %flavor_with_most_data_size = ();    # size in bytes of that biggest product.sto file (0 if missing or empty)

foreach my $flavor ("off", "obf", "opf", "opff") {
	my $products_collection = get_products_collection({database => $flavor, timeout => $socket_timeout_ms});

	# Only fetch the few fields needed for the report
	my $cursor = $products_collection->query({})
		->fields({_id => 1, code => 1, owner => 1, product_name => 1, brands => 1, scans_n => 1});
	$cursor->immortal(1);

	while (my $product_ref = $cursor->next) {
		my $code = $product_ref->{code};
		$flavors{all}{$code}++;
		$flavors{$flavor}{$code}++;

		# Check which flavor has the biggest product file.
		# The file is stat'ed only once per product: -s yields undef/0 for a missing or empty file.
		my $path = product_path($product_ref);
		my $size = (-s "/srv/$flavor/products/$path/product.sto") || 0;
		if ((not defined $flavor_with_most_data{$code}) or ($size > $flavor_with_most_data_size{$code})) {
			$flavor_with_most_data{$code} = $flavor;
			$flavor_with_most_data_size{$code} = $size;
		}

		# Keep the highest number of scans across flavors
		my $scans_n = $product_ref->{scans_n} || 0;
		if ($scans_n > ($scans{$code} || 0)) {
			$scans{$code} = $scans_n;
		}

		# Keep the first defined name and brands, in flavor order
		$product_names{$code} //= $product_ref->{product_name};
		$brands{$code} //= $product_ref->{brands};
	}
}

# Summary: number of distinct codes per flavor (the "all" key covers every flavor)
foreach my $flavor (keys %flavors) {
	print "Flavor $flavor\t" . scalar(keys %{$flavors{$flavor}}) . " products\n";
}

my $duplicates_count = 0;

# Tab separated report of the codes seen more than once.
# Stop right away if the file cannot be created, instead of printing to an unopened handle.
my $duplicates_file = "/srv/off/html/files/duplicate_products.csv";
open(my $out, ">:encoding(UTF-8)", $duplicates_file) or die("Could not create $duplicates_file: $!\n");
print $out
	"flavor_with_most_data\tcode\tflavor_with_most_data_size\tproduct_name\tbrands\tscans\toff\tobf\topf\topff\n";

my %urls = (
	off => "https://world.openfoodfacts.org",
	obf => "https://world.openbeautyfacts.org",
	opf => "https://world.openproductsfacts.org",
	opff => "https://world.openpetfoodfacts.org",
);

foreach my $code (sort keys %{$flavors{all}}) {
	# Seen only once across all flavors: not a duplicate
	next if $flavors{all}{$code} <= 1;

	# Columns shared by the STDOUT line and the CSV line
	my @details = (
		$flavor_with_most_data_size{$code},
		($product_names{$code} || ''),
		($brands{$code} || ''),
		($scans{$code} || 0),
	);

	# STDOUT starts with the code, the CSV starts with the flavor to keep
	# (the first CSV column is the one reviewed manually afterwards)
	my @stdout_columns = ($code, $flavor_with_most_data{$code}, @details);
	my @csv_columns = ($flavor_with_most_data{$code}, $code, @details);

	# One column per flavor: empty if the code does not exist on that flavor
	foreach my $flavor ("off", "obf", "opf", "opff") {
		if ($flavors{$flavor}{$code}) {
			push @stdout_columns, $flavor . " (" . $flavors{$flavor}{$code} . ")";
			push @csv_columns, $urls{$flavor} . "/product/$code";
		}
		else {
			push @stdout_columns, "";
			push @csv_columns, "";
		}
	}

	print join("\t", @stdout_columns) . "\n";
	print $out join("\t", @csv_columns) . "\n";
	$duplicates_count++;
}

# Buffered write errors (e.g. disk full) only surface when the handle is closed
close($out) or die("Could not write $duplicates_file: $!\n");

print "\n\n" . $duplicates_count . " duplicate products\n\n";
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#!/usr/bin/perl -w

# This file is part of Product Opener.
#
# Product Opener
# Copyright (C) 2011-2019 Association Open Food Facts
# Contact: contact@openfoodfacts.org
# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
#
# Product Opener is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

use CGI::Carp qw(fatalsToBrowser);

# NOTE: ProductOpener::PerlStandards (loaded below) is not available in
# old versions of ProductOpener running on obf, opf, opff

use ProductOpener::PerlStandards;

use ProductOpener::Config qw/:all/;
use ProductOpener::Store qw/:all/;
use ProductOpener::Tags qw/:all/;
use ProductOpener::Products qw/:all/;
use ProductOpener::Paths qw/:all/;
use ProductOpener::Data qw/:all/;
use ProductOpener::Orgs qw/:all/;
use ProductOpener::Redis qw/:all/;

use CGI qw/:cgi :form escapeHTML/;
use Storable qw/dclone/;
use Getopt::Long;
use File::Copy (qw/move/);

# Help text, printed on STDERR when the mandatory --flavor-code-csv option is missing
my $usage = <<TXT
Usage:
./2024_10_remove_duplicate_products_on_wrong_flavors.pl --flavor-code-csv [CSV file with flavor and code columns]
This script reads the tab separated CSV file given with the --flavor-code-csv option
The first column is either empty or contains a flavor: off, obf, opf, or opff
The second column is a barcode
Other columns may be included, they were generated by the 2024_10_detect_duplicate_products_in_different_flavors.pl script
The output of the detect duplicate script was reviewed manually to put in the first column which flavor should be kept.
Reviewed file: https://docs.google.com/spreadsheets/d/1-2WMvUC4J7iRYe3587mHJ1htIxPFyo7JLDLKHVmSum0/edit?gid=1565589772#gid=1565589772
The script will move the products that are not kept to the other-flavors-codes directory (for both product and images)
TXT
	;

# Path of the reviewed CSV file (mandatory)
my $csv_file;

GetOptions("flavor-code-csv=s" => \$csv_file,);

# Nothing can be done without the CSV file: show the usage and stop
if (not defined $csv_file) {
	print STDERR $usage;
	exit;
}

# Append-only log of every product moved by this script.
# The script deletes data: refuse to run if its actions cannot be logged.
my $log_file = "$data_root/logs/remove_duplicate_products_on_wrong_flavors.log";
open(my $log, ">>", $log_file) or die("Could not open log file $log_file: $!\n");
print $log "remove_duplicate_products_on_wrong_flavors.pl started at " . localtime() . "\n";

# MongoDB collections of the current flavor: current products and obsolete products
my $products_collection = get_products_collection();
my $obsolete_products_collection = get_products_collection({obsolete => 1});

# Remove a product from the current flavor:
# - move its product directory to $data_root/products/other-flavors-codes/
# - delete it from the products and obsolete products MongoDB collections
# - move its images directory (if any) to $www_root/images/products/other-flavors-codes/
#
# Parameter: $code - barcode of the product to remove
# Returns nothing. Each move is reported on STDERR and in the log file;
# a failed move is reported with the system error and does not stop the script.
sub move_code_to_other_flavors_codes($code) {

	my $product_id = product_id_for_owner(undef, $code);
	my $dir = product_path_from_id($product_id);

	# The target is a single directory named with the digits of the product path
	my $target_dir = $dir;
	$target_dir =~ s/[^0-9]//g;

	# Every message goes to both STDERR and the log file
	my $report = sub {
		my ($message) = @_;
		print STDERR $message;
		print $log $message;
		return;
	};

	my $products_target = "$data_root/products/other-flavors-codes/$target_dir";
	if (move("$data_root/products/$dir", $products_target)) {
		$report->("moved other flavors code $dir to $products_target\n");
	}
	else {
		# Read $! right away, before another system call overwrites it
		my $error = $!;
		$report->("could not move other flavors code $dir to $products_target: $error\n");
	}

	# Delete from mongodb
	# NOTE(review): the product is deleted from MongoDB even if the move above failed - confirm this is intended
	# NOTE(review): _id is the raw code, not $product_id - presumably identical for public products, verify
	$products_collection->delete_one({_id => $code});
	$obsolete_products_collection->delete_one({_id => $code});

	# Also move the image dir if it exists
	if (-e "$www_root/images/products/$dir") {
		my $images_target = "$www_root/images/products/other-flavors-codes/$target_dir";
		if (move("$www_root/images/products/$dir", $images_target)) {
			$report->("moved other flavors code $dir images to $images_target\n");
		}
		else {
			my $error = $!;
			$report->("could not move other flavors code $dir images to $images_target: $error\n");
		}
	}

	return;
}

# Directories that receive the products and images removed from the current flavor
ensure_dir_created_or_die("$data_root/products/other-flavors-codes");
ensure_dir_created_or_die("$www_root/images/products/other-flavors-codes");

# Only these values are accepted in the first column (see the usage text).
# Anything else (typo, stray text) must not be read as "keep the product on another flavor",
# otherwise the product would be removed from every flavor the script is run on.
my %known_flavors = map {$_ => 1} qw(off obf opf opff);

# Open CSV file
open(my $csv_fh, "<", $csv_file) or die "Could not open file $csv_file: $!";

while (my $line = <$csv_fh>) {
	# Remove the line ending, including the \r of files saved with CRLF line endings
	$line =~ s/[\r\n]+\z//;
	my ($kept_flavor, $code) = split(/\t/, $line);

	# No second column (e.g. empty line): nothing to do
	next if not defined $code;

	$code = normalize_code($code);

	# Code not numeric? may be header line, skip
	if ((not defined $code) or ($code !~ /^\d+$/)) {
		next;
	}

	# Undefined flavor, do nothing
	if (not defined $kept_flavor) {
		next;
	}

	# Ignore spaces around the flavor that may have been typed during the manual review
	$kept_flavor =~ s/^\s+|\s+$//g;
	if ($kept_flavor eq "") {
		next;
	}

	# Unknown flavor: we cannot tell where the product should be kept, so do not remove anything
	if (not exists $known_flavors{$kept_flavor}) {
		print STDERR "code $code has an unknown kept flavor \"$kept_flavor\", skipping\n";
		next;
	}

	# Check if the product exists on the current flavor
	my $product_ref = retrieve_product(product_id_for_owner(undef, $code), "include_deleted");
	if (not defined $product_ref) {
		print STDERR "code $code does not exist on the current flavor\n";
		next;
	}

	# Check if the kept flavor is equal to the flavor the script is running on
	if ($kept_flavor eq $flavor) {
		print STDERR "code $code is on the kept flavor $flavor\n";
	}
	else {
		print STDERR "code $code should be on the kept flavor $kept_flavor instead of $flavor\n";
		move_code_to_other_flavors_codes($code);

		# Push a deleted event to Redis
		push_to_redis_stream("remove-duplicates-bot", $product_ref, "deleted",
			"duplicate product: keep product on $kept_flavor, remove from $flavor", undef);
	}
}

close($csv_fh);

# Buffered log lines are only guaranteed to be written once the handle is closed
close($log) or print STDERR "could not close the log file: $!\n";

0 comments on commit 2dd39c5

Please sign in to comment.