Skip to content

Commit

Permalink
feat: script to count product contributions by year (#10957)
Browse files Browse the repository at this point in the history
Sample output on the dev test database:

```
year	active_editors	new_editors	products_edited	products_added	total_products
2012	2	2	1	1	1
2013	5	4	3	2	3
2014	6	4	3	2	5
2015	7	3	6	5	10
2016	6	4	5	3	13
2017	12	7	27	21	34
2018	21	11	55	44	78
2019	26	18	74	40	118
2020	31	17	120	53	171
2021	34	21	92	40	211
2022	36	17	147	67	278
2023	55	30	117	34	312
2024	48	21	96	49	361
all	159	159	361	361	361
```

Output on the whole database:
https://docs.google.com/spreadsheets/d/11nrerWDwzFV1gZb0i16TI_shh9C3XgddYn5KcGl4DO8/edit?gid=0#gid=0

---------

Co-authored-by: Pierre Slamich <pierre@openfoodfacts.org>
  • Loading branch information
stephanegigandet and teolemon authored Nov 15, 2024
1 parent 9a6503a commit ca320be
Showing 1 changed file with 250 additions and 0 deletions.
250 changes: 250 additions & 0 deletions scripts/count_product_contributions_by_year.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
#!/usr/bin/perl -w

# This file is part of Product Opener.
#
# Product Opener
# Copyright (C) 2011-2024 Association Open Food Facts
# Contact: contact@openfoodfacts.org
# Address: 21 rue des Iles, 94100 Saint-Maur des Fossés, France
#
# Product Opener is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

use Modern::Perl '2017';
use utf8;

# Usage text, printed when the command line arguments cannot be parsed.
# It lists every option that changes what the script counts: the --query filters,
# --obsolete and --all-owners.
my $usage = <<TXT
count_product_contributions_by_year.pl - Count product contributions by year

Usage:
count_product_contributions_by_year.pl [--query filters] [--obsolete] [--all-owners]

Options:
--obsolete	read the products from the obsolete products collection instead of the current products collection
--all-owners	on the producers platform, allow the script to run without a --query owners_tags=... filter

Query filters:
--query some_field=some_value (e.g. categories_tags=en:beers)	filter the products (--query parameters can be repeated to have multiple filters)
--query some_field=-some_value	match products that don't have some_value for some_field
--query some_field=value1,value2	match products that have value1 and value2 for some_field (must be a _tags field)
--query some_field=value1\|value2	match products that have value1 or value2 for some_field (must be a _tags field)
TXT
	;

use ProductOpener::Config qw/:all/;
use ProductOpener::Paths qw/%BASE_DIRS/;
use ProductOpener::Store qw/retrieve store/;
use ProductOpener::Index qw/:all/;
use ProductOpener::Display qw/:all/;
use ProductOpener::Tags qw/:all/;
use ProductOpener::Images qw/process_image_crop/;
use ProductOpener::Lang qw/$lc/;
use ProductOpener::Mail qw/:all/;
use ProductOpener::Products qw/:all/;
use ProductOpener::Data qw/get_products_collection/;
use ProductOpener::LoadData qw/load_data/;
use ProductOpener::Redis qw/push_to_redis_stream/;

use CGI qw/:cgi :form escapeHTML/;
use URI::Escape::XS;
use Storable qw/dclone/;
use Encode;
use JSON::MaybeXS;
use Data::DeepAccess qw(deep_get deep_exists deep_set);
use Data::Compare;

use Log::Any::Adapter 'TAP';

use Getopt::Long;

# Command line state, filled in by GetOptions below.
my $query_params_ref = {};    # field => value pairs collected from the repeated --query options
my ($all_owners, $obsolete, $fix) = ('', 0, 0);
# NOTE(review): --fix is parsed into $fix, but nothing visible in this script reads it.

# Option specifications paired with the variables that receive their values.
my @option_specs = (
	"query=s%" => $query_params_ref,
	"all-owners" => \$all_owners,
	"obsolete" => \$obsolete,
	"fix" => \$fix,
);

# GetOptions returns false when the command line could not be parsed.
unless (GetOptions(@option_specs)) {
	die("Error in command line arguments:\n\n$usage");
}

# Select the products to process.
# The MongoDB query is built from the filters entered with --query,
# e.g. --query categories_tags=en:plant-milks
my $query_ref = {};

add_params_to_query($query_params_ref, $query_ref);

# On the producers platform, require --query owners_tags to be set, or the --all-owners field to be set.
if ($server_options{private_products}) {
	if ((not $all_owners) and (not defined $query_ref->{owners_tags})) {
		print STDERR "On producers platform, --query owners_tags=... or --all-owners must be set.\n";
		# This is a usage error: exit with a non-zero status so that calling
		# shells and cron jobs do not mistake the refused run for a success.
		exit(1);
	}
}

# Show the resulting query on STDERR so that the filters can be checked.
use Data::Dumper;
print STDERR "MongoDB query:\n" . Dumper($query_ref);

my $socket_timeout_ms = 2 * 60000;    # 2 mins, instead of 30s default, to not die as easily if mongodb is busy.

# Collection that will be used to iterate products
# (--obsolete selects which collection get_products_collection returns).
my $products_collection = get_products_collection({obsolete => $obsolete, timeout => $socket_timeout_ms});

# The count is only informative: if it fails, report why and keep going,
# the products are still iterated with the cursor below.
my $products_count = "";

eval {
	$products_count = $products_collection->count_documents($query_ref);

	print STDERR "$products_count documents to check.\n";
	1;
} or do {
	print STDERR "Could not count the documents to check: " . ($@ || "unknown error") . "\n";
};

# only retrieve important fields
my $cursor = $products_collection->query($query_ref)->fields({_id => 1, code => 1, owner => 1});

# Iterating all the products can take a long time: mark the cursor as immortal
# (per the MongoDB driver documentation, the server then does not expire it when idle).
$cursor->immortal(1);

# Statistics gathered from the change history of each product.
# The first four hashes are keyed by year, plus an "all" key that covers every year.
my %products_edited = ();    # year => { product code => 1 } for products with at least one change that year
my %products_added = ();    # year => { product code => 1 } for products whose first change is in that year
my %number_of_products = ();    # year => cumulative number of products (filled after the loop)
my %new_editors = ();    # year => number of editors whose first change is in that year (filled after the loop)
my %active_editors = ();    # year => { userid => 1 } for users with at least one change that year
my %editors_first_year = ();    # userid => earliest year in which the user made a change

my $i = 0;    # number of products processed so far, for the progress messages

while (my $product_ref = $cursor->next) {

	my $code = $product_ref->{code};
	my $path = product_path($product_ref);

	# Retrieve the changes.sto file
	my $changes_ref = retrieve("$data_root/products/$path/changes.sto");
	if (defined $changes_ref) {

		# The first entry of the changes list is treated as the creation of the product
		my $first_change = 1;

		# Go through each change
		foreach my $change_ref (@{$changes_ref}) {
			# Get the timestamp and userid.
			# Changes without a userid are attributed to the shared "openfoodfacts-contributors" account
			# (the fallback used to be misspelled, which counted those changes under a bogus extra editor).
			my $t = $change_ref->{t};
			my $userid = $change_ref->{userid} || "openfoodfacts-contributors";
			# Get the year (in the local timezone of the machine running the script)
			my $year = (localtime($t))[5] + 1900;

			# First change: update products_added
			if ($first_change) {
				deep_set(\%products_added, $year, $code, 1);
				deep_set(\%products_added, "all", $code, 1);
				$first_change = 0;
			}

			# Update products_edited
			deep_set(\%products_edited, $year, $code, 1);
			deep_set(\%products_edited, "all", $code, 1);

			# Update the active editors
			deep_set(\%active_editors, $year, $userid, 1);
			deep_set(\%active_editors, "all", $userid, 1);

			# Keep the earliest year seen for each editor: products are not iterated
			# in chronological order, so a later product can hold an older change.
			if ((not defined $editors_first_year{$userid}) or ($year < $editors_first_year{$userid})) {
				$editors_first_year{$userid} = $year;
			}
		}
	}

	$i++;
	($i % 1000 == 0) and print STDERR "$i products checked\n";
}

# Count the new editors: every editor is counted once, in the year of
# their earliest change, and once in the "all" total.
for my $first_year (values %editors_first_year) {
	$new_editors{$first_year} += 1;
	$new_editors{"all"} += 1;
}

# Print the per-year statistics (and the "all" totals) to STDERR, one report after the other.
# Each entry is: heading, statistics hash, and whether the per-year value is a set
# (a hash whose keys are counted) or already a plain number.
my @stderr_reports = (
	["Active editors by year:\n", \%active_editors, 1],
	["New editors by year:\n", \%new_editors, 0],
	["Products added by year:\n", \%products_added, 1],
	["Products edited by year:\n", \%products_edited, 1],
);

for my $report_ref (@stderr_reports) {
	my ($heading, $stats_ref, $is_set) = @{$report_ref};

	print STDERR $heading;

	for my $period (sort keys %{$stats_ref}) {
		my $value
			= $is_set
			? (scalar(keys %{$stats_ref->{$period}}) || 0)
			: $stats_ref->{$period};
		print STDERR "$period: " . $value . "\n";
	}
}

# Compute the total number of products by year: the products added in the year
# plus the products added in all previous years.
# The years are walked once in chronological order with a running total,
# instead of re-summing every previous year for each year.
my $cumulative_products = 0;

foreach my $year (sort {$a <=> $b} grep {$_ ne "all"} keys %products_added) {
	$cumulative_products += scalar keys %{$products_added{$year}};
	$number_of_products{$year} = $cumulative_products;
}

# The "all" entry is the total over every year; it only exists if at least one product was seen.
if (exists $products_added{"all"}) {
	$number_of_products{"all"} = $cumulative_products;
}

# Print the total number of products by year
print STDERR "Total number of products by year:\n";
foreach my $year (sort keys %number_of_products) {
	print STDERR "$year: $number_of_products{$year}\n";
}

# Print all the stats by year in tab separated columns to STDOUT
# (one header line, one line per year, then the "all" line, which sorts after the years).
print "year\tactive_editors\tnew_editors\tproducts_edited\tproducts_added\ttotal_products\n";
foreach my $year (sort keys %number_of_products) {
	print join(
		"\t",
		$year,
		scalar(keys %{$active_editors{$year}}),
		# A year has no entry in %new_editors when every editor active that year
		# had already contributed in an earlier year: output 0, not an empty cell.
		$new_editors{$year} // 0,
		scalar(keys %{$products_edited{$year}}),
		scalar(keys %{$products_added{$year}}),
		$number_of_products{$year}
	) . "\n";
}

exit(0);

0 comments on commit ca320be

Please sign in to comment.